# Azure AI Search: vector search, step by step

## Set up API client


In [None]:
import os

import dotenv
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient


# Load environment variables through python-dotenv before reading any setting.
dotenv.load_dotenv()

# Name of the small Q&A index that the following cells create and fill.
AZURE_SEARCH_TINY_INDEX = "envision-qa-index"

# Service endpoint and API-key credential, both read from the environment.
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
search_creds = AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY"))

# Client used for index-level operations (the index is created with it below).
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=search_creds,
)

## Search a tiny index

### Create index

In [None]:
from azure.search.documents.indexes.models import (
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    CorsOptions,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile
)

# Options shared by the three text fields (question / answer / applicationType).
text_field_options = dict(
    filterable=True,
    sortable=True,
    facetable=True,
    type=SearchFieldDataType.String,
    analyzer_name="zh-Hans.microsoft",
)

# Options shared by the two vector fields; both hold 1536-dimensional vectors
# and use the same vector search profile.
vector_field_options = dict(
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    hidden=False,
    searchable=True,
    filterable=False,
    sortable=False,
    facetable=False,
    vector_search_dimensions=1536,
    vector_search_profile_name="azureOpenAIHnswProfile",
)

index_fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        searchable=True,
        filterable=True,
        sortable=True,
        facetable=True,
    ),
    SearchableField(name="question", **text_field_options),
    SearchableField(name="answer", **text_field_options),
    SearchableField(name="applicationType", **text_field_options),
    SearchField(name="question_embedding", **vector_field_options),
    SearchField(name="answer_embedding", **vector_field_options),
]

# Semantic configuration: the question is both title and keyword field,
# the answer is the content field.
semantic_settings = SemanticSearch(
    configurations=[
        SemanticConfiguration(
            name="default",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="question"),
                content_fields=[SemanticField(field_name="answer")],
                keywords_fields=[SemanticField(field_name="question")],
            ),
        )
    ]
)

# Azure OpenAI vectorizer attached to the vector profile; its connection
# settings come from environment variables.
aoai_vectorizer = AzureOpenAIVectorizer(
    name="azureOpenAIVectorizer",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=os.getenv("aoai_resource_uri"),
        deployment_id=os.getenv("aoai_embedding_deployment_name"),
        model_name=os.getenv("aoai_embedding_model_name"),
        api_key=os.getenv("aozai_api_key"),
    ),
)

# One HNSW algorithm configuration and one profile tying it to the vectorizer.
vector_settings = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(
            name="azureOpenAIHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="azureOpenAIVectorizer",
        )
    ],
    vectorizers=[aoai_vectorizer],
)

index = SearchIndex(
    name=AZURE_SEARCH_TINY_INDEX,
    cors_options=CorsOptions(allowed_origins=["*"], max_age_in_seconds=600),
    fields=index_fields,
    semantic_search=semantic_settings,
    vector_search=vector_settings,
)

# Create the index, or update its definition if it already exists.
index_client.create_or_update_index(index)

In [None]:
from openai import AzureOpenAI

# Client bound to the tiny index; used later to upload and query documents.
search_client = SearchClient(
    index_name=AZURE_SEARCH_TINY_INDEX,
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=search_creds,
)

# Rebuild the search credential, endpoint and index client from the environment.
search_creds = AzureKeyCredential(os.getenv("AZURE_SEARCH_API_KEY"))
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=search_creds,
)

# Azure OpenAI client used to generate embeddings for the documents.
aoai_client = AzureOpenAI(
    azure_endpoint=os.getenv("aoai_resource_uri"),
    api_version=os.getenv("aoai_llm_api_version"),
    api_key=os.getenv("aozai_api_key"),
)

In [None]:
def generate_embeddings_with_client(texts, client):
    """Generate embeddings for a list of texts using an Azure OpenAI client.

    Blank entries (``None``, empty or whitespace-only strings) are not sent to
    the service; they get an empty list in the result, so the output always
    lines up index-for-index with ``texts``.

    Args:
        texts: List of strings to embed. ``None`` or an empty list yields ``[]``.
        client: Object exposing ``embeddings.create(input=..., model=...)``
            whose response has a ``data`` list of items with an ``embedding``
            attribute.

    Returns:
        A list with one entry per input text: the embedding for non-blank
        texts, ``[]`` for blank ones. If the request fails for any reason the
        error is printed and every entry is ``[]`` (best-effort behaviour).
    """
    # Nothing to embed; also covers texts=None, which has no length.
    if not texts:
        return []

    # Positions of the texts worth sending, so results can be put back in place.
    positions = [i for i, text in enumerate(texts) if text and text.strip()]
    if not positions:
        return [[] for _ in texts]

    try:
        # NOTE(review): with an Azure OpenAI client, `model` is normally the
        # deployment name; this reads the model-name variable - confirm that
        # the two are the same in this environment.
        response = client.embeddings.create(
            input=[texts[i] for i in positions],
            model=os.getenv("aoai_embedding_model_name"),
        )

        # Start from all-empty and fill in the embeddings that came back.
        result = [[] for _ in texts]
        for position, embedding_data in zip(positions, response.data):
            result[position] = embedding_data.embedding
        return result
    except Exception as e:
        # Deliberate best-effort: report the problem and return placeholders.
        print(f"Error generating embeddings: {str(e)}")
        return [[] for _ in texts]

In [None]:
def upload_document_into_azure_search(documents, batch_size: int = 100):
    """Upload documents through the module-level ``search_client`` in batches.

    Each slice of ``batch_size`` documents is sent with
    ``search_client.upload_documents``. A per-batch line and a final summary
    are printed; a batch that raises is counted as entirely failed and the
    loop continues with the next batch.

    Args:
        documents: Sliceable sequence of documents to upload.
        batch_size: Maximum number of documents per upload call.

    Returns:
        None. Results are reported on standard output only.
    """
    total = len(documents)
    uploaded = 0
    failed = 0

    for batch_number, start in enumerate(range(0, total, batch_size), start=1):
        chunk = documents[start:start + batch_size]
        try:
            outcomes = search_client.upload_documents(chunk)
            ok_count = len([outcome for outcome in outcomes if outcome.succeeded])
            uploaded += ok_count
            failed += len(chunk) - ok_count
            print(f"Batch {batch_number}: {ok_count}/{len(chunk)} documents succeeded")
        except Exception as e:
            print(f"Failed to upload batch {batch_number}: {str(e)}")
            failed += len(chunk)

    # Display final results
    print("\nUpload Summary:")
    print(f"- Total documents: {total}")
    print(f"- Successfully uploaded: {uploaded}")
    print(f"- Failed: {failed}")

In [None]:
# 知识库---人工生成.xlsx 
import pandas as pd
# Workbook with the human-generated knowledge base.
excel_path = "/home/azureuser/slm-fine-tune-private-domain-kb-generation/raw_documents/excel/知识库---人工生成.xlsx"
df = pd.read_excel(excel_path)
# Normalise the headers (lowercase, surrounding whitespace removed) so that
# later column lookups do not depend on how the headers were typed.
df.columns = list(map(lambda header: header.lower().strip(), df.columns))


def prepare_kb_human_gen_documents(df):
    """Turn the human-generated knowledge-base sheet into search documents.

    Rows are processed 20 at a time; for each batch the questions and the
    answers are embedded with ``generate_embeddings_with_client`` using the
    module-level ``aoai_client``.

    Args:
        df: DataFrame with ``question``, ``answer`` and ``metadata`` columns.

    Returns:
        A list of dicts with keys ``id`` (``kb_human_gen_<row position>``),
        ``question``, ``answer``, ``applicationType`` (taken from
        ``metadata``), ``question_embedding`` and ``answer_embedding``.
        Missing cell values become empty strings.

    Raises:
        ValueError: If ``df`` is empty or a required column is missing.
    """
    if df.empty:
        raise ValueError("The DataFrame is empty. Please provide a valid DataFrame.")

    absent = [name for name in ("question", "answer", "metadata") if name not in df.columns]
    if absent:
        raise ValueError(f"The following required columns are missing: {', '.join(absent)}")

    def as_text(value):
        # NaN / missing cells are stored as empty strings.
        return "" if pd.isna(value) else str(value)

    chunk_size = 20  # number of texts sent per embedding request
    row_count = len(df)
    documents = []

    for start in range(0, row_count, chunk_size):
        stop = min(start + chunk_size, row_count)
        rows = [row for _, row in df.iloc[start:stop].iterrows()]

        questions = [as_text(row["question"]) for row in rows]
        answers = [as_text(row["answer"]) for row in rows]

        # One embedding request per column for the whole batch.
        question_embeddings = generate_embeddings_with_client(questions, aoai_client)
        answer_embeddings = generate_embeddings_with_client(answers, aoai_client)

        print(f"Processed documents {start+1} to {stop} (batch {start//chunk_size + 1})")

        for offset, row in enumerate(rows):
            documents.append(
                {
                    "id": f"kb_human_gen_{start + offset}",
                    "question": questions[offset],
                    "answer": answers[offset],
                    "applicationType": as_text(row["metadata"]),
                    "question_embedding": question_embeddings[offset],
                    "answer_embedding": answer_embeddings[offset],
                }
            )

    print(f"Prepared {len(documents)} documents with embeddings")
    return documents


# Build the documents (text plus embeddings) from the sheet loaded above.
kb_human_gen_documents = prepare_kb_human_gen_documents(df)
# Send them to the search index, at most 100 documents per upload call.
upload_document_into_azure_search(documents=kb_human_gen_documents, batch_size=100)


### Upload the AI-generated knowledge base

In [21]:
# 知识库---ai生产.xlsx 
import pandas as pd
# Workbook with the AI-generated knowledge base.
excel_path = "/home/azureuser/slm-fine-tune-private-domain-kb-generation/raw_documents/excel/知识库---ai生产.xlsx"
df = pd.read_excel(excel_path)
# Lowercase and trim every header so column lookups are predictable.
normalized_headers = []
for header in df.columns:
    normalized_headers.append(header.lower().strip())
df.columns = normalized_headers

# Show the resulting column names.
print(df.columns)


def prepare_kb_ai_gen_documents(df):
    """Turn the AI-generated knowledge-base sheet into search documents.

    Rows are handled in batches of 20. Each batch's questions and answers are
    embedded via ``generate_embeddings_with_client`` with the module-level
    ``aoai_client``.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns.

    Returns:
        A list of dicts with keys ``id`` (``kb_ai_gen_<row position>``),
        ``question``, ``answer``, ``applicationType`` (always ``""``),
        ``question_embedding`` and ``answer_embedding``. Missing cell values
        become empty strings.

    Raises:
        ValueError: If ``df`` is empty or a required column is missing.
    """
    # Ensure the DataFrame is not empty
    if df.empty:
        raise ValueError("The DataFrame is empty. Please provide a valid DataFrame.")

    # Check for required columns
    missing_columns = [col for col in ("question", "answer") if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following required columns are missing: {', '.join(missing_columns)}")

    embed_batch_size = 20  # number of texts sent per embedding request
    total_rows = len(df)
    documents = []

    for batch_no, first in enumerate(range(0, total_rows, embed_batch_size), start=1):
        last = min(first + embed_batch_size, total_rows)

        # Collect the cleaned (question, answer) text of every row in the batch.
        pairs = []
        for _, row in df.iloc[first:last].iterrows():
            question_text = "" if pd.isna(row["question"]) else str(row["question"])
            answer_text = "" if pd.isna(row["answer"]) else str(row["answer"])
            pairs.append((question_text, answer_text))

        # Generate embeddings in batch
        question_embeddings = generate_embeddings_with_client([q for q, _ in pairs], aoai_client)
        answer_embeddings = generate_embeddings_with_client([a for _, a in pairs], aoai_client)

        print(f"Processed documents {first+1} to {last} (batch {batch_no})")

        # Create documents with embeddings
        for offset, (question_text, answer_text) in enumerate(pairs):
            document = {
                "id": f"kb_ai_gen_{first + offset}",
                "question": question_text,
                "answer": answer_text,
                "applicationType": "",
                "question_embedding": question_embeddings[offset],
                "answer_embedding": answer_embeddings[offset],
            }
            documents.append(document)

    print(f"Prepared {len(documents)} documents with embeddings")
    return documents


# Build the documents (text plus embeddings) from the sheet loaded above.
kb_ai_gen_documents = prepare_kb_ai_gen_documents(df)
# Send them to the search index, at most 100 documents per upload call.
upload_document_into_azure_search(documents=kb_ai_gen_documents, batch_size=100)


Index(['question', 'answer'], dtype='object')
Processed documents 1 to 20 (batch 1)
Processed documents 21 to 40 (batch 2)
Processed documents 41 to 60 (batch 3)
Processed documents 61 to 80 (batch 4)
Processed documents 81 to 100 (batch 5)
Processed documents 101 to 120 (batch 6)
Processed documents 121 to 140 (batch 7)
Processed documents 141 to 160 (batch 8)
Processed documents 161 to 180 (batch 9)
Processed documents 181 to 200 (batch 10)
Processed documents 201 to 220 (batch 11)
Processed documents 221 to 240 (batch 12)
Processed documents 241 to 260 (batch 13)
Processed documents 261 to 280 (batch 14)
Processed documents 281 to 300 (batch 15)
Processed documents 301 to 320 (batch 16)
Processed documents 321 to 340 (batch 17)
Processed documents 341 to 360 (batch 18)
Processed documents 361 to 380 (batch 19)
Processed documents 381 to 400 (batch 20)
Processed documents 401 to 420 (batch 21)
Processed documents 421 to 440 (batch 22)
Processed documents 441 to 460 (batch 23)
Process

### Upload completed ticket records, then search using vector similarity

In [None]:
# 已完成票单记录.xlsx
import pandas as pd
# Workbook with the completed ticket records.
excel_path = "/home/azureuser/slm-fine-tune-private-domain-kb-generation/raw_documents/excel/已完成票单记录.xlsx"
df = pd.read_excel(excel_path)
# Lowercase and trim every header so column lookups are predictable.
cleaned_headers = [header.lower().strip() for header in df.columns]
df.columns = cleaned_headers

# Show the resulting column names.
print(df.columns)


def prepare_kb_ticket_documents(df):
    """Turn the completed-ticket sheet into search documents.

    The ``问题描述`` (problem description) column is used as the question and
    ``处理方式`` (handling method) as the answer. Rows are handled in batches
    of 20; each batch is embedded via ``generate_embeddings_with_client`` with
    the module-level ``aoai_client``.

    Args:
        df: DataFrame with ``问题描述`` and ``处理方式`` columns.

    Returns:
        A list of dicts with keys ``id`` (``kb_ticket_<row position>``),
        ``question``, ``answer``, ``applicationType`` (always ``""``),
        ``question_embedding`` and ``answer_embedding``. Missing cell values
        become empty strings.

    Raises:
        ValueError: If ``df`` is empty or a required column is missing.
    """
    if df.empty:
        raise ValueError("The DataFrame is empty. Please provide a valid DataFrame.")

    question_col, answer_col = "问题描述", "处理方式"
    missing = [col for col in (question_col, answer_col) if col not in df.columns]
    if missing:
        raise ValueError(f"The following required columns are missing: {', '.join(missing)}")

    def clean(cell):
        # Missing cells are represented as empty strings.
        if pd.isna(cell):
            return ""
        return str(cell)

    step = 20  # number of texts sent per embedding request
    n_rows = len(df)
    documents = []

    start = 0
    while start < n_rows:
        end = min(start + step, n_rows)
        batch_rows = [row for _, row in df.iloc[start:end].iterrows()]

        questions = [clean(row[question_col]) for row in batch_rows]
        answers = [clean(row[answer_col]) for row in batch_rows]

        # One embedding request per column for the whole batch.
        question_embeddings = generate_embeddings_with_client(questions, aoai_client)
        answer_embeddings = generate_embeddings_with_client(answers, aoai_client)

        print(f"Processed documents {start+1} to {end} (batch {start//step + 1})")

        documents.extend(
            {
                "id": f"kb_ticket_{start + offset}",
                "question": questions[offset],
                "answer": answers[offset],
                "applicationType": "",
                "question_embedding": question_embeddings[offset],
                "answer_embedding": answer_embeddings[offset],
            }
            for offset in range(len(batch_rows))
        )
        start = end

    print(f"Prepared {len(documents)} documents with embeddings")
    return documents


# Build the documents with the ticket-specific builder: this sheet has the
# 问题描述 / 处理方式 columns, not the question / answer columns that
# prepare_kb_ai_gen_documents requires.
kb_ticket_documents = prepare_kb_ticket_documents(df)
# Upload the ticket documents into Azure AI Search, 100 per upload call.
upload_document_into_azure_search(kb_ticket_documents, batch_size=100)


In [None]:
from azure.search.documents.models import VectorizedQuery

# The index created above has two 1536-dimensional vector fields
# (question_embedding / answer_embedding), so the query vector must come from
# the same embedding model as the documents and must target one of them.
tiny_query_text = "如何重置密码"  # sample question ("how do I reset my password"); replace as needed
tiny_query_vector = aoai_client.embeddings.create(
    input=[tiny_query_text],
    model=os.getenv("aoai_embedding_model_name"),
).data[0].embedding

# Pure vector search: no keyword text, the 3 nearest neighbours by question embedding.
r = search_client.search(search_text=None, vector_queries=[
    VectorizedQuery(vector=tiny_query_vector, k_nearest_neighbors=3, fields="question_embedding")])
for doc in r:
    print(f"id: {doc['id']}, score: {doc['@search.score']}")

## Search a larger index

In [None]:
import azure.identity
import dotenv
import openai

# Load environment variables through python-dotenv (once is enough).
dotenv.load_dotenv()

# Initialize Azure search variables
AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

# Set up OpenAI client based on environment variables
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Token credential shared by the Azure OpenAI token provider below and by the
# SearchClient for the larger index. It has to be defined here because nothing
# earlier in the notebook creates it.
azure_credential = azure.identity.DefaultAzureCredential()

# Callable that fetches bearer tokens for the Cognitive Services scope.
token_provider = azure.identity.get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider)

def get_embedding(text):
    """Return the embedding of ``text`` from the module-level ``openai_client``.

    The request uses the deployment named by ``AZURE_OPENAI_ADA_DEPLOYMENT``;
    the embedding of the first item in the response is returned.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Point the search client at the larger index, authenticating with the token credential.
AZURE_SEARCH_FULL_INDEX = "gptkbindex"
search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_FULL_INDEX, credential=azure_credential)

# Embed the query text, then run a pure vector search (no keyword text).
search_query = "learning about underwater activities"
search_vector = get_embedding(search_query)
vector_query = VectorizedQuery(vector=search_vector, k_nearest_neighbors=5, fields="embedding")
r = search_client.search(search_text=None, top=5, vector_queries=[vector_query])

# Print each hit's score and the first 150 characters of its content on one line.
for hit in r:
    flattened = hit["content"].replace("\n", " ")
    content = flattened[:150]
    print(f"Score: {hit['@search.score']:.5f}\tContent:{content}")