# Document Retrieve - Index (Manual)

Quick reference links to the Azure AI Search 11.06 documentation:

- [SearchClient](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-search)
- [Models](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.models?view=azure-python)
- [Index](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes?view=azure-python)


In [3]:
from dotenv import load_dotenv
# Load settings from a local .env file so the os.getenv / os.environ lookups
# in later cells can find the service endpoints and keys.
load_dotenv()

True

In [4]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SearchField,  
    VectorSearch,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile
)

In [5]:
import os

# Azure AI Search connection settings.
# os.environ[...] is used instead of os.getenv(...) so that a missing variable
# fails right here with a KeyError naming the variable, rather than passing
# None into the credential / client constructors below.
service_endpoint = os.environ["AZSCH_ENDPOINT"]
key = os.environ["AZSCH_KEY"]
credential = AzureKeyCredential(key)

# Name of the index that the following cells create, fill and query.
index_name = 'aoai-docs'

In [6]:
# Define the index schema and vector-search settings, then create (or update)
# the index on the search service.

# HNSW algorithm settings; fields refer to them through the named profile.
hnsw_parameters = HnswParameters(
    m=4,
    ef_construction=400,
    ef_search=500,
    metric=VectorSearchAlgorithmMetric.COSINE,
)
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw", parameters=hnsw_parameters)
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
)
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# Key field first, then the four text fields (all declared the same way),
# and finally the embedding field bound to the HNSW profile.
fields = [SimpleField(name="id", type=SearchFieldDataType.String, key=True)]
fields += [
    SearchableField(name=field_name, type=SearchFieldDataType.String,
                    searchable=True, retrievable=True)
    for field_name in ("title", "chunk", "parent_id", "chunk_id")
]
fields.append(
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Must equal the length of the embedding lists uploaded to this field.
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    )
)

# Send the definition to the service and report the resulting index name.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 aoai-docs created


## Ingest

In [7]:
import os
from openai import AzureOpenAI

# Azure OpenAI client used by generate_embeddings below. The key and endpoint
# are read from the environment; a missing variable raises KeyError here.
client = AzureOpenAI(
    api_key=os.environ['JP_AOAI_KEY'],
    api_version="2023-05-15",
    azure_endpoint=os.environ['JP_AOAI_ENDPOINT'],
)

def generate_embeddings(text):
    """Return the embedding for ``text``.

    Sends ``text`` to the ``text-embedding-ada-002`` model through the
    module-level ``client`` and returns the ``embedding`` attribute of the
    first item in the response's ``data``.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

In [8]:
import pandas as pd
# Load the pre-chunked documentation (title / chunk / parent_id / chunk_id).
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that you produced yourself or otherwise trust.
df = pd.read_pickle('./aoai-docs.pkl')
# Last expression of the cell: render the frame.
df

Unnamed: 0,title,chunk,parent_id,chunk_id
0,Quickstart: Use GPT-4 Turbo with Vision on you...,Use this article to get started using the Azur...,gpt-v-python.md,0
1,Quickstart: Use GPT-4 Turbo with Vision on you...,```console\n python quickstart.py\n ```\...,gpt-v-python.md,1
2,Quickstart: Use Azure OpenAI Service with the ...,[Source code](https://github.com/Azure/azure-s...,chatgpt-javascript.md,0
3,Quickstart: Use Azure OpenAI Service with the ...,Run the script with the following command:\n\n...,chatgpt-javascript.md,1
4,use-your-data-rest.md,[!INCLUDE [Set up required variables](./use-yo...,use-your-data-rest.md,0
...,...,...,...,...
438,How to use function calling with Azure OpenAI ...,To use function calling with the Chat Completi...,function-calling.md,2
439,How to use function calling with Azure OpenAI ...,print(response.choices[0].message.model_dump_j...,function-calling.md,3
440,How to use function calling with Azure OpenAI ...,If you want to describe a function that doesn'...,function-calling.md,4
441,How to use function calling with Azure OpenAI ...,"In the examples, we don't do any validation or...",function-calling.md,5


In [9]:
import json
import time, math
from tqdm.auto import tqdm

# Embed every chunk, throttled to at most `batch_size` requests per window.
batch_size = 100  # embedding requests sent per throttle window
window_s = 60     # minimum wall-clock seconds spent on each batch

vectors = []
for i in tqdm(range(0, len(df), batch_size)):
    t_s = time.time()
    # set end position of batch
    i_end = min(i + batch_size, len(df))

    # Positional (iloc) slicing excludes the end position. Label-based
    # df.loc[i:i_end] includes it, which would embed every batch-boundary row
    # twice and push each batch to batch_size + 1 requests.
    vectors.extend(generate_embeddings(chunk) for chunk in df['chunk'].iloc[i:i_end])

    # Pad the batch out to the full window, but do not wait after the last one.
    if i_end < len(df):
        sleep_s = window_s - math.floor(time.time() - t_s)
        time.sleep(sleep_s if (sleep_s > 0) else 0)

# One embedding list per row; dtype=object keeps each list as a single cell.
df['vector'] = pd.Series(vectors, index=df.index, dtype=object)


  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Render the frame again to inspect the 'vector' column filled in by the embedding loop.
df

Unnamed: 0,title,chunk,parent_id,chunk_id,vector
0,Quickstart: Use GPT-4 Turbo with Vision on you...,Use this article to get started using the Azur...,gpt-v-python.md,0,"[-0.004444070626050234, -0.012473619543015957,..."
1,Quickstart: Use GPT-4 Turbo with Vision on you...,```console\n python quickstart.py\n ```\...,gpt-v-python.md,1,"[-0.01605406403541565, -0.015318022109568119, ..."
2,Quickstart: Use Azure OpenAI Service with the ...,[Source code](https://github.com/Azure/azure-s...,chatgpt-javascript.md,0,"[0.004981602542102337, 6.3825718825683e-05, -0..."
3,Quickstart: Use Azure OpenAI Service with the ...,Run the script with the following command:\n\n...,chatgpt-javascript.md,1,"[0.0016307272017002106, 0.013343888334929943, ..."
4,use-your-data-rest.md,[!INCLUDE [Set up required variables](./use-yo...,use-your-data-rest.md,0,"[-0.019076131284236908, 0.007477246690541506, ..."
...,...,...,...,...,...
438,How to use function calling with Azure OpenAI ...,To use function calling with the Chat Completi...,function-calling.md,2,"[-0.021478373557329178, 0.006619121413677931, ..."
439,How to use function calling with Azure OpenAI ...,print(response.choices[0].message.model_dump_j...,function-calling.md,3,"[-0.020235424861311913, 0.009053390473127365, ..."
440,How to use function calling with Azure OpenAI ...,If you want to describe a function that doesn'...,function-calling.md,4,"[-0.013387761078774929, 0.01943673938512802, -..."
441,How to use function calling with Azure OpenAI ...,"In the examples, we don't do any validation or...",function-calling.md,5,"[0.0035275937989354134, 0.02130926586687565, -..."


In [11]:
# Save the frame, including the 'vector' column, to a pickle file in the working directory.
df.to_pickle('aoai-docs-vector.pkl')

In [12]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as the dot product of the two vectors divided by the product of
    their norms.
    """
    dot_product = np.dot(A, B)
    magnitude_product = norm(A) * norm(B)
    return dot_product / magnitude_product

In [14]:
# Local baseline: embed the query, score every stored vector against it with
# cosine_similarity, and keep the three highest-scoring rows.
query = "what is assistant api?"
vector = generate_embeddings(query)

df["cosine_sim"] = [
    cosine_similarity(doc_vector, vector) for doc_vector in df["vector"]
]
result = df.sort_values(by="cosine_sim", ascending=False).head(3)

result

Unnamed: 0,title,chunk,parent_id,chunk_id,vector,cosine_sim
318,Azure OpenAI Service Assistant API concepts,# Azure OpenAI Assistants API (Preview)\n\nAss...,assistants.md,0,"[-0.018719211220741272, 0.01062846276909113, 0...",0.807857
431,Quickstart - Getting started with Azure OpenAI...,# Quickstart: Get started using Azure OpenAI A...,assistants-quickstart.md,0,"[-0.0012442003935575485, 0.002871643053367734,...",0.787601
305,How to create Assistants with Azure OpenAI Ser...,# Getting started with Azure OpenAI Assistants...,assistant.md,0,"[-0.015064164996147156, 0.00019143604731652886...",0.781962


In [16]:
# Document-level client for the index named by index_name; used below to
# upload documents and to run queries.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)

In [17]:
import json
from tqdm.auto import tqdm

# Upload the rows to the search index in batches of `batch_size` documents.
batch_size = 20
failed_keys = []  # ids of documents the service reported as not indexed
for i in tqdm(range(0, len(df), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(df))

    # One dict per row; the row's index label becomes the document key, and
    # the embedding computed earlier is reused instead of calling the model again.
    documents = df[i:i_end].apply(
        lambda row: {'id': str(row.name),
                     'title': row['title'],
                     'chunk': row['chunk'],
                     'parent_id': row['parent_id'],
                     'chunk_id': str(row['chunk_id']),
                     'vector': row['vector']
                    }, axis=1).to_list()

    # upload_documents returns one IndexingResult per document; a batch can
    # partially fail without raising, so collect the keys that did not succeed.
    result = search_client.upload_documents(documents)
    failed_keys.extend(item.key for item in result if not item.succeeded)

# Surface partial failures instead of silently dropping them.
if failed_keys:
    print(f'{len(failed_keys)} document(s) failed to index: {failed_keys}')

  0%|          | 0/23 [00:00<?, ?it/s]

In [18]:
from azure.search.documents.models import VectorizableTextQuery, VectorQuery, VectorizedQuery

In [19]:
from colorama import Fore, Back, Style

def azsch_embed_query(query, k=3, score_threshold=0.8, top=10):
    """Run a pure vector search for ``query`` and print the scored hits.

    The query text is embedded with ``generate_embeddings`` and sent to the
    module-level ``search_client`` as a ``VectorizedQuery`` against the
    ``vector`` field (no text search: ``search_text`` is ``None``).

    Args:
        query: Text to embed and search for.
        k: Number of nearest neighbours requested from the vector field.
        score_threshold: Hits scoring below this value are printed in red,
            all others in green.
        top: Value passed as ``top`` to ``search_client.search``.

    Returns:
        None. One line per hit is printed: score, title, parent_id, chunk_id.
    """
    vector_query = VectorizedQuery(
        vector=generate_embeddings(query),
        k_nearest_neighbors=k,
        fields="vector",
    )

    results = search_client.search(
        search_text=None,
        vector_queries=[vector_query],
        select=["title", "chunk", "parent_id", "chunk_id"],
        top=top,
    )

    for result in results:
        score = result['@search.score']
        color = Fore.RED if (score < score_threshold) else Fore.GREEN
        print(color + f"{score:.10f}"
              + Style.RESET_ALL + f": {result['title']} - {result['parent_id']}, {result['chunk_id']}")

In [20]:
%%time
# Time one call end to end: embedding the query text plus the search request.
azsch_embed_query('assistant api?')

[32m0.8167673000[0m: Azure OpenAI Service Assistant API concepts - assistants.md, 0
[32m0.8136005000[0m: Quickstart - Getting started with Azure OpenAI Assistants (Preview) - assistants-quickstart.md, 0
[32m0.8045437300[0m: How to create Assistants with Azure OpenAI Service - assistant.md, 0
CPU times: user 17 ms, sys: 86 µs, total: 17 ms
Wall time: 346 ms
