### Load configuration from environment variables

In [109]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Load settings from the local .env file into the process environment.
load_dotenv(override=True)

# --- Azure AI Search ---
# Variables not used here do not need to be updated in your .env file
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# Use the admin API key when one is configured (non-empty); otherwise fall
# back to DefaultAzureCredential.
if os.environ.get("AZURE_SEARCH_ADMIN_KEY"):
    AZURE_SEARCH_ADMIN_CREDENTIAL = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
else:
    AZURE_SEARCH_ADMIN_CREDENTIAL = DefaultAzureCredential()
index_name = "pull-cosmosdb-nosql-chunk-index"

# --- Azure OpenAI embeddings ---
# The endpoint is required; the API key is optional and is None when unset.
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_openai_embedding_deployment = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small"
)
azure_openai_model_name = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-3-small"
)
# Environment values are strings, so convert the vector size to an int.
azure_openai_model_dimensions = int(
    os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536)
)

# --- Azure OpenAI chat ---
# note: The chat deployment should support tool use
# To learn more, please see
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-models
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35
azure_openai_chat_deployment = os.environ.get(
    "AZURE_OPENAI_CHATGPT_DEPLOYMENT", "gpt-4o-mini"
)
azure_openai_api_version = os.environ.get(
    "AZURE_OPENAI_API_VERSION", "2024-07-01-preview"
)

### Create a search index

In [110]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex,
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerSkillset,
    SearchIndexerIndexProjectionSelector,  
    SearchIndexerIndexProjections,  
    SearchIndexerIndexProjectionsParameters, 
    IndexProjectionMode,
    AzureOpenAIEmbeddingSkill
)

# Create a search index
# note: You must adjust these fields based on the schema of your Cosmos DB documents.
# The title and header fields are not chunked in this sample; only the document content is split into sections.
# The "id" key field is generated by the indexer for each chunk, and "unique_id" is the parent key field used by the index projection.
# Learn more at https://learn.microsoft.com/en-us/azure/search/search-howto-index-cosmosdb


## ['title', 'header', 'content', 'summary', 'title_vector','content_vector', 'unique_id']
       
# Client used to create and update index definitions on the search service.
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
    credential=AZURE_SEARCH_ADMIN_CREDENTIAL,
)

# Attributes shared by the plain text fields (no sorting, filtering or faceting).
text_field_options = dict(
    type=SearchFieldDataType.String,
    sortable=False,
    filterable=False,
    facetable=False,
)

fields = [
    # Document key of each chunk.
    SearchField(
        name="id",
        key=True,
        type=SearchFieldDataType.String,
        analyzer_name="keyword",
    ),
    # Plain text fields, declared in the order title, header, section.
    *(
        SearchField(name=text_field_name, **text_field_options)
        for text_field_name in ("title", "header", "section")
    ),
    # Filterable identifier, referenced as the parent key field by the skillset's index projection.
    SearchField(
        name="unique_id",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=True,
        facetable=False,
        analyzer_name="keyword",
    ),
    # Optional fields, currently disabled:
    #SearchField(name="summary", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    #SearchField(name="title_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=azure_openai_model_dimensions, vector_search_profile_name="myHnswProfile"),
    # Embedding of the "section" text; its size must match the embedding model's dimensions.
    SearchField(
        name="SectionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=azure_openai_model_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search: one HNSW algorithm, one profile that uses it, and an
# Azure OpenAI vectorizer attached to that profile.
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw")
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        model_name=azure_openai_model_name,
        api_key=azure_openai_key,
    ),
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration: "title" is the title field and "section" the content field.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    content_fields=[SemanticField(field_name="section")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and create it, or update it if it already exists.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

pull-cosmosdb-nosql-chunk-index created


## Create or update CosmosDB data source connector on Azure AI Search

In [111]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SoftDeleteColumnDeletionDetectionPolicy
)


# Cosmos DB settings; all of them are required environment variables.
# Only the container, connection string and data source name are used in this cell.
AZURE_COSMOS_DB_ENDPOINT = os.environ["AZURE_COSMOS_DB_ENDPOINT"]
AZURE_COSMOS_DB_KEY = os.environ["AZURE_COSMOS_DB_KEY"]
AZURE_COSMOS_DB_DATABASE = os.environ["AZURE_COSMOS_DB_DATABASE"]
AZURE_COSMOS_DB_CONTAINER = os.environ["AZURE_COSMOS_DB_CONTAINER"]
AZURE_COSMOS_DB_CONN = os.environ["AZURE_COSMOS_DB_CONN"]
AZURE_COSMOS_DB_DATASOURCE_NAME = os.environ["AZURE_COSMOS_DB_DATASOURCE_NAME"]

# Client used to manage data sources, skillsets and indexers.
indexer_client = SearchIndexerClient(
    AZURE_SEARCH_SERVICE_ENDPOINT,
    AZURE_SEARCH_ADMIN_CREDENTIAL,
)

# Soft-delete detection.
# NOTE: To remove a record from the search index, set an "IsDeleted" property
# on the source document to "True". The next indexer run will remove this record.
soft_delete_policy = SoftDeleteColumnDeletionDetectionPolicy(
    soft_delete_column_name="IsDeleted",
    soft_delete_marker_value="True",
)

# Describe the Cosmos DB container to index and register it as a data source.
# To learn more please visit https://learn.microsoft.com/en-us/azure/search/search-howto-index-cosmosdb
container = SearchIndexerDataContainer(name=AZURE_COSMOS_DB_CONTAINER)
data_source_connection = SearchIndexerDataSourceConnection(
    name=AZURE_COSMOS_DB_DATASOURCE_NAME,
    type="cosmosdb",
    connection_string=AZURE_COSMOS_DB_CONN,
    container=container,
    data_deletion_detection_policy=soft_delete_policy,
)
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'searchdemocdbnosql' created or updated


## Create a skillset

In [112]:
# Create a skillset that (1) splits each source document into chunks ("pages")
# and (2) generates an embedding for every chunk, then projects each chunk into
# the search index as its own document.
skillset_name = f"{index_name}-skillset"

# Step 1: chunk /document/content. The chunks are written to /document/pages.
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=400,
    page_overlap_length=50,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Step 2: embed every chunk. Because the context is /document/pages/*, the
# output lands at /document/pages/*/SectionVector, which is the path the index
# projection below reads from.
# NOTE: despite its variable name and description, this skill embeds the page
# chunks, not the title. The name is kept so other cells that may reference it
# keep working.
title_embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate title embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=azure_openai_endpoint,
    deployment_id=azure_openai_embedding_deployment,
    model_name=azure_openai_model_name,
    dimensions=azure_openai_model_dimensions,
    api_key=azure_openai_key,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/pages/*"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="SectionVector")
    ],
)

# Project one search document per chunk into the target index; parent
# documents themselves are not indexed (SKIP_INDEXING_PARENT_DOCUMENTS).
# NOTE(review): "unique_id" is used both as the parent key field and as an
# explicit mapping target below - confirm against the index projection docs
# that mapping the parent key field explicitly is intended.
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="unique_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="section", source="/document/pages/*"),
                InputFieldMappingEntry(name="SectionVector", source="/document/pages/*/SectionVector"),
                InputFieldMappingEntry(name="title", source="/document/title"),
                InputFieldMappingEntry(name="header", source="/document/header"),
                InputFieldMappingEntry(name="unique_id", source="/document/unique_id"),
            ],
        )
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# Both skills must be registered. With only the split skill, no skill produces
# /document/pages/*/SectionVector, so the SectionVector index field would never
# be populated.
skills = [split_skill, title_embedding_skill]

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projections=index_projections,
)

indexer_client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

pull-cosmosdb-nosql-chunk-index-skillset created


### Create an indexer

In [113]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    FieldMappingFunction,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerParsingMode
)

# Define the indexer that ties the data source, the skillset and the target index together.
indexer_name = f"{index_name}-indexer"

# No explicit batch size; max_failed_items and max_failed_items_per_batch are both set to 0.
indexer_parameters = IndexingParameters(
    batch_size=None,
    max_failed_items=0,
    max_failed_items_per_batch=0,
)

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    parameters=indexer_parameters,
)

# Create (or update) the indexer on the service, then trigger a run.
indexer_client = SearchIndexerClient(
    AZURE_SEARCH_SERVICE_ENDPOINT,
    AZURE_SEARCH_ADMIN_CREDENTIAL,
)
indexer_result = indexer_client.create_or_update_indexer(indexer)

indexer_client.run_indexer(indexer_name)
print(f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.')