# Document Retrieve - Indexer (Built-in)

Prep: create a blob folder `docs` before running the indexer.

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SearchField,  
    VectorSearch,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    AzureOpenAIEmbeddingSkill,
    FieldMapping,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    SplitSkill,
)

In [None]:
import os

search_endpoint = os.getenv("AZSCH_ENDPOINT")  
credential = AzureKeyCredential(os.environ["AZSCH_KEY"])
index_name = os.environ["AZSCH_INDEX_NAME"]

api_key = os.environ["AZURE_OPENAI_KEY"]
azure_endpoint = os.environ['AZURE_OPENAI_ENDPOINT']

storage_connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
# docs
storage_container = os.environ["AZURE_STORAGE_CONTAINER"]

local_folder = "docs"

In [4]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=search_endpoint, credential=credential)
fields = [
    SearchableField(name="chunk_id", key=True, analyzer_name="keyword", sortable=True),
    SimpleField(name="parent_id", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="title"),
    SearchableField(name="chunk", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")  
]

# indexer
indexer_client = SearchIndexerClient(search_endpoint, credential)

data_source_connections = indexer_client.get_data_source_connections()
indexer_client.create_data_source_connection(
    data_source_connection=SearchIndexerDataSourceConnection(
        name=index_name, 
        type=SearchIndexerDataSourceType.AZURE_BLOB,
        connection_string=storage_connection_string,
        container=SearchIndexerDataContainer(name=storage_container)))

vector_search = VectorSearch(  
    algorithms=[  
        HnswAlgorithmConfiguration(  
            name="myHnsw",  
            parameters=HnswParameters(  
                m=4,  
                ef_construction=400,  
                ef_search=500,  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        )
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm_configuration_name="myHnsw",
            vectorizer_name="vectorizer"
        )
    ],
    vectorizers=[
      AzureOpenAIVectorizer(
        vectorizer_name="vectorizer",
        kind="azureOpenAI",
        parameters = AzureOpenAIVectorizerParameters(
            resource_url=azure_endpoint,
            api_key=api_key,
            deployment_name="text-embedding-ada-002",
            model_name="text-embedding-ada-002",
        ),
      )
    ]
)  

semantic_config = SemanticConfiguration(  
    name="semantic-config",  
    prioritized_fields=SemanticPrioritizedFields(  
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="chunk")]  
    ),  
)

# Create the semantic search with the configuration  
semantic_search = SemanticSearch(configurations=[semantic_config]) 

# Create the search index
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 sample-aoai-docs created


## Indexer

In [5]:
skillsets = indexer_client.get_skillsets()
print(f"Creating skillset: {index_name}")
indexer_client.create_skillset(
    skillset=SearchIndexerSkillset(
        name=index_name,
        skills=[
            SplitSkill(
                text_split_mode="pages",
                context="/document",
                maximum_page_length=2000,
                page_overlap_length=500,
                inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
                outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")]),
            AzureOpenAIEmbeddingSkill(
                context="/document/pages/*",
                resource_url=azure_endpoint,
                api_key=api_key,
                deployment_name="text-embedding-ada-002",
                model_name="text-embedding-ada-002",
                dimensions=1536,
                inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
                outputs=[OutputFieldMappingEntry(name="embedding", target_name="text_vector")])
        ],
        index_projection=SearchIndexerIndexProjection(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=index_name,
                    parent_key_field_name="parent_id",
                    source_context="/document/pages/*",
                    mappings=[
                        InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                        InputFieldMappingEntry(name="vector", source="/document/pages/*/text_vector"),
                        InputFieldMappingEntry(name="title", source="/document/metadata_storage_name")
                    ]
                )
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
            )
        )))

indexers = indexer_client.get_indexers()
indexer_client.create_indexer(
    indexer=SearchIndexer(
        name=index_name,
        data_source_name=index_name,
        skillset_name=index_name,
        target_index_name=index_name,        
        field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")]
    )
)

Creating skillset: sample-aoai-docs


<azure.search.documents.indexes.models._models.SearchIndexer at 0x1084eba90>

## Upload data

In [6]:
#indexer_client = SearchIndexerClient(service_endpoint, credential)
indexer_name=index_name

from azure.storage.blob import BlobServiceClient

blob_client = BlobServiceClient.from_connection_string(
    storage_connection_string,
    max_single_put_size=4 * 1024 * 1024
)

container_client = blob_client.get_container_client(storage_container)
if not container_client.exists():
    container_client.create_container()

existing_blobs = [blob.name for blob in container_client.list_blobs()]

# Open each file in local(`/data``) folder
for file in os.scandir(local_folder):
    with open(file.path, "rb") as opened_file:
        filename = os.path.basename(file.path)
        print("Uploading blob for file: %s", filename)
        blob_client = container_client.upload_blob(filename, opened_file, overwrite=True)

    # Start the indexer
    indexer_client.run_indexer(indexer_name)
    print("Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.")

Uploading blob for file: %s assistants-studio.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s advanced-prompt-engineering.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s assistants-reference.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s assistants-reference-threads.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s assistants-reference-messages.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s azure-search.md
Indexer started. Any unindexed blobs should be indexed in a few minutes, check the Azure Portal for status.
Uploading blob for file: %s

## Query

In [7]:
from azure.search.documents.models import VectorizableTextQuery, VectorQuery, VectorizedQuery

In [8]:
from colorama import Fore, Back, Style

search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)

def azsch_embed_query(query):
    #vector_query = VectorizedQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
    vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=3, fields="vector", exhaustive=True)

    results = search_client.search(  
        search_text=None,  
        vector_queries=[vector_query],
        select=["title", "chunk", "parent_id", "chunk_id"],
        top=10 # for limiting text search
    ) 

    for result in results:  
        print((Fore.RED if (result['@search.score'] < 0.8) else Fore.GREEN) + f"{result['@search.score']:.10f}" 
              + Style.RESET_ALL + f": {result['title']} - {result['parent_id']}, {result['chunk_id']}")  

In [9]:
%%time
azsch_embed_query("what is Assistant API?")

[32m0.8456522000[0m: assistants.md - aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50cy5tZA2, fc9946dbd1bc_aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50cy5tZA2_pages_0
[32m0.8219723700[0m: assistant.md - aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50Lm1k0, 301bd3bea9ce_aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50Lm1k0_pages_0
[32m0.8212635000[0m: assistants.md - aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50cy5tZA2, fc9946dbd1bc_aHR0cHM6Ly9pa2FwcHN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2RvY3MvYXNzaXN0YW50cy5tZA2_pages_1
CPU times: user 10.5 ms, sys: 6.65 ms, total: 17.1 ms
Wall time: 985 ms
