# Document Retrieve - Index (Manual)

Quick reference for Azure AI Search 11.06 Document:

- [SearchClient](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-search)
- [Models](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.models?view=azure-python)
- [Index](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes?view=azure-python)


> **Warning:** This sample is not up to date, and some of its parameters need to be modified.

See also the Azure AI Search integrated vectorization sample: [link](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb)

In [1]:
from dotenv import load_dotenv
# Load settings from a .env file so the os.environ lookups in the following cells can resolve
load_dotenv()

True

In [2]:
import os

# Azure OpenAI key, passed to the index's Azure OpenAI vectorizer further down.
# A missing AZURE_OPENAI_KEY raises KeyError here.
api_key = os.environ["AZURE_OPENAI_KEY"]

In [3]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SearchField,  
    VectorSearch,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)

In [4]:
import os

# Azure AI Search connection settings. All of them are required, so each is read
# with os.environ[...]: a missing variable fails here with a KeyError naming it,
# instead of passing endpoint=None on to the search clients.
service_endpoint = os.environ["AZSCH_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZSCH_KEY"])

# Target index name (set AZSCH_INDEX_NAME, e.g. `aoai-md-index`)
index_name = os.environ["AZSCH_INDEX_NAME"]

In [5]:
# Client used to manage index definitions on the search service
index_client = SearchIndexClient(credential=credential, endpoint=service_endpoint)

# Index schema: a string key, four searchable text columns and one vector column
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    *(
        SearchableField(
            name=text_column,
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        )
        for text_column in ("title", "chunk", "parent_id", "chunk_id")
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # presumably the output size of the embedding model used for ingestion -- confirm
        vector_search_dimensions=1536,
        # refers to the VectorSearchProfile named "myHnswProfile" in the vector search config
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search configuration: one HNSW algorithm, one profile that binds the
# algorithm to a vectorizer, and the Azure OpenAI vectorizer itself.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            # HNSW tuning values, with cosine as the similarity metric
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        )
    ],
    profiles=[
        VectorSearchProfile(
            # name referenced by the `vector` field's vector_search_profile_name
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="vectorizer",
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="vectorizer",
            kind="azureOpenAI",
            parameters=AzureOpenAIVectorizerParameters(
                # Take the resource URL from the environment, like the key, so the
                # vectorizer targets the same Azure OpenAI resource as the embeddings
                # client instead of a resource name hard-coded in the notebook.
                resource_url=os.environ["AZURE_OPENAI_ENDPOINT"],
                api_key=api_key,
                deployment_name="text-embedding-ada-002",
                model_name="text-embedding-ada-002",
            ),
        )
    ],
)

# Semantic configuration: `title` is the title field, `chunk` is the content field
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="chunk")],
        title_field=SemanticField(field_name="title"),
    ),
)

# Semantic search settings holding that single configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and create it, or update it if it already exists
index = SearchIndex(
    name=index_name,
    fields=fields,
    semantic_search=semantic_search,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 test-aoai-docs created


## Ingest

In [6]:
import os
from openai import AzureOpenAI

# Azure OpenAI client used for embedding requests; key and endpoint come from the environment
client = AzureOpenAI(
    api_key=os.environ['AZURE_OPENAI_KEY'],
    api_version="2024-10-01-preview",
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
)

def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the module-level ``client`` with the
    "text-embedding-ada-002" model and returns the ``embedding`` of the first
    item in the response data.
    """
    embedding_response = client.embeddings.create(model="text-embedding-ada-002", input=text)
    first_item = embedding_response.data[0]
    return first_item.embedding

In [8]:
import pandas as pd
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
# Load the chunked documents (columns shown in the output: title, chunk, parent_id, chunk_id)
df = pd.read_pickle('./aoai-docs.pkl')
df

Unnamed: 0,title,chunk,parent_id,chunk_id
0,How to use content filters (preview) with Azur...,# How to configure content filters with Azure ...,content-filters.md,0
1,How to use content filters (preview) with Azur...,Customers are responsible for ensuring that ap...,content-filters.md,1
2,How to use content filters (preview) with Azur...,":::image type=""content"" source=""../media/conte...",content-filters.md,2
3,dotnet-new-application.md,### Create a new .NET Core application\n\nIn a...,dotnet-new-application.md,0
4,Customize a model with Azure OpenAI Service an...,## Prerequisites\n\n- Read the [When to use Az...,fine-tuning-python.md,0
...,...,...,...,...
433,Use your image data with Azure OpenAI Service ...,Your blob storage should contain image files a...,use-your-image-data.md,2
434,Use your image data with Azure OpenAI Service ...,## Turn on CORS\n\nIf CORS isn't already turne...,use-your-image-data.md,3
435,use-your-data-go.md,[!INCLUDE [Set up required variables](./use-yo...,use-your-data-go.md,0
436,use-your-data-go.md,1. Install the following Go packages:\n\n ``...,use-your-data-go.md,1


In [8]:
import json
import time, math
from tqdm.auto import tqdm

count = 0
batch_size = 100  # rows embedded per 60-second window (one embedding request per row)
for i in tqdm(range(0, len(df), batch_size)):
    t_s = time.time()
    # end position of the batch (exclusive)
    i_end = min(i + batch_size, len(df))
    # .loc slices by label and INCLUDES the stop label, so stop at i_end - 1;
    # slicing up to i_end would embed the first row of the next batch twice.
    # (Like the loop itself, this assumes the default 0..n-1 integer index.)
    df.loc[i:i_end - 1, 'vector'] = df.loc[i:i_end - 1, 'chunk'].apply(generate_embeddings)

    # Throttle: wait out the remainder of the 60-second window before the next
    # batch. After the final batch there is nothing left to wait for.
    if i_end < len(df):
        t_f = time.time()
        sleep_s = 60 - math.floor(t_f - t_s)
        time.sleep(max(sleep_s, 0))


  0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Show the frame again; it now has the `vector` column filled in by the embedding loop
df

Unnamed: 0,title,chunk,parent_id,chunk_id,vector
0,How to use content filters (preview) with Azur...,# How to configure content filters with Azure ...,content-filters.md,0,"[0.029494330286979675, -0.006135216914117336, ..."
1,How to use content filters (preview) with Azur...,Customers are responsible for ensuring that ap...,content-filters.md,1,"[0.03570285066962242, -0.007272129878401756, -..."
2,How to use content filters (preview) with Azur...,":::image type=""content"" source=""../media/conte...",content-filters.md,2,"[0.030622564256191254, -0.0014172176597639918,..."
3,dotnet-new-application.md,### Create a new .NET Core application\n\nIn a...,dotnet-new-application.md,0,"[-0.0026705467607825994, 0.0049848007038235664..."
4,Customize a model with Azure OpenAI Service an...,## Prerequisites\n\n- Read the [When to use Az...,fine-tuning-python.md,0,"[-0.0018976553110405803, -0.010764623060822487..."
...,...,...,...,...,...
433,Use your image data with Azure OpenAI Service ...,Your blob storage should contain image files a...,use-your-image-data.md,2,"[-0.01049772184342146, -0.006959473714232445, ..."
434,Use your image data with Azure OpenAI Service ...,## Turn on CORS\n\nIf CORS isn't already turne...,use-your-image-data.md,3,"[-0.011906027793884277, -0.010828031226992607,..."
435,use-your-data-go.md,[!INCLUDE [Set up required variables](./use-yo...,use-your-data-go.md,0,"[-0.0073547265492379665, 0.006100298836827278,..."
436,use-your-data-go.md,1. Install the following Go packages:\n\n ``...,use-your-data-go.md,1,"[-0.0024404972791671753, 0.015535353682935238,..."


In [10]:
# Persist the frame, including the computed `vector` column, to disk
df.to_pickle('aoai-docs-vector.pkl')

In [10]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as ``dot(A, B)`` divided by the product of the two vector norms
    (``numpy.linalg.norm`` with its default settings).
    """
    dot_product = np.dot(A, B)
    norm_product = norm(A) * norm(B)
    return dot_product / norm_product

In [11]:
# Rank the chunks locally with the in-memory embeddings
query = "what is assistant api?"
vector = generate_embeddings(query)

# Similarity of every stored chunk embedding to the query embedding
df["cosine_sim"] = df["vector"].apply(cosine_similarity, args=(vector,))

# Keep the three best-scoring rows
result = df.sort_values(by="cosine_sim", ascending=False).iloc[:3]

result

Unnamed: 0,title,chunk,parent_id,chunk_id,vector,cosine_sim
81,Azure OpenAI Service Assistant API concepts,# Azure OpenAI Assistants API (Preview)\n\nAss...,assistants.md,0,"[-0.01865777187049389, 0.010454908944666386, 0...",0.807845
283,Quickstart - Getting started with Azure OpenAI...,# Quickstart: Get started using Azure OpenAI A...,assistants-quickstart.md,0,"[-0.0010632603662088513, 0.0028020874597132206...",0.787679
211,How to create Assistants with Azure OpenAI Ser...,# Getting started with Azure OpenAI Assistants...,assistant.md,0,"[-0.014795494265854359, -5.406972923083231e-05...",0.78195


In [12]:
# Client for document-level operations (upload, search) on the target index
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)

In [13]:
import json
from tqdm.auto import tqdm

count = 0
batch_size = 20  # documents per upload request
failed_keys = []  # keys of documents the service reported as not indexed
for i in tqdm(range(0, len(df), batch_size)):
    # end position of the batch (exclusive)
    i_end = min(i + batch_size, len(df))

    # Build the upload payload. The embedding is taken from the precomputed
    # `vector` column rather than requested again per row.
    documents = df.iloc[i:i_end].apply(
        lambda row: {'id': str(row.name),
                     'title': row['title'],
                     'chunk': row['chunk'],
                     'parent_id': row['parent_id'],
                     'chunk_id': str(row['chunk_id']),
                     'vector': row['vector']
                    }, axis=1).to_list()

    # upload_documents returns one IndexingResult per document. Record the
    # rejected ones instead of discarding the result, so partial failures are visible.
    result = search_client.upload_documents(documents)
    failed_keys.extend(item.key for item in result if not item.succeeded)

if failed_keys:
    print(f"{len(failed_keys)} document(s) failed to index: {failed_keys}")

  0%|          | 0/22 [00:00<?, ?it/s]

In [14]:
from azure.search.documents.models import VectorizableTextQuery, VectorQuery, VectorizedQuery

In [None]:
from colorama import Fore, Back, Style

def azsch_embed_query(query):
    """Run a vector search for ``query`` and print the scored hits.

    The query text is embedded with ``generate_embeddings`` and searched against
    the ``vector`` field (3 nearest neighbours). Each hit is printed as
    ``score: title - parent_id, chunk_id``; scores below 0.8 are printed in
    red, all others in green.
    """
    # Alternative: VectorizableTextQuery(text=query, ...) takes the raw text
    # instead of a vector computed here.
    query_vector = generate_embeddings(query)
    vector_query = VectorizedQuery(vector=query_vector, k_nearest_neighbors=3, fields="vector")

    # Pure vector query: no search text, only the fields needed for printing
    results = search_client.search(
        search_text=None,
        vector_queries=[vector_query],
        select=["title", "chunk", "parent_id", "chunk_id"],
        top=10,
    )

    for hit in results:
        score = hit['@search.score']
        if score < 0.8:
            colour = Fore.RED
        else:
            colour = Fore.GREEN
        label = f": {hit['title']} - {hit['parent_id']}, {hit['chunk_id']}"
        print(colour + f"{score:.10f}" + Style.RESET_ALL + label)

In [20]:
%%time
# Time one full query: the embedding request plus the search call
azsch_embed_query('assistant api?')

[32m0.8167502000[0m: Azure OpenAI Service Assistant API concepts - assistants.md, 0
[32m0.8136052500[0m: Quickstart - Getting started with Azure OpenAI Assistants (Preview) - assistants-quickstart.md, 0
[32m0.8045602400[0m: How to create Assistants with Azure OpenAI Service - assistant.md, 0
CPU times: user 9.55 ms, sys: 0 ns, total: 9.55 ms
Wall time: 941 ms
