# Document Retrieve - Index (Manual)

Quick reference links to the Azure AI Search Python SDK (`azure-search-documents` 11.6) documentation:

- [SearchClient](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.searchclient?view=azure-python#azure-search-documents-searchclient-search)
- [Models](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.models?view=azure-python)
- [Index](https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes?view=azure-python)


> **Warning:** this sample is not up to date; some parameters need to be modified before it will work!

Reference: Azure AI Search integrated vectorization sample ([link](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb)).

In [None]:
from dotenv import load_dotenv
# Load key/value pairs from a local .env file (if present) into the process
# environment; the cells below read their settings from os.environ.
load_dotenv()

In [None]:
import os

# Azure OpenAI settings, read from the environment.
# A missing variable raises KeyError here rather than failing later.
api_key = os.environ["AZURE_OPENAI_KEY"]
azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]

In [None]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SearchField,  
    VectorSearch,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)

In [None]:
import os
# Azure AI Search connection settings, read from the environment.
# os.environ[...] is used for every setting (instead of os.getenv, which
# silently returns None) so a missing variable fails right here with a
# KeyError naming the variable, not later inside the SDK client.
service_endpoint = os.environ["AZSCH_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZSCH_KEY"])

# Name of the index to create/update; set AZSCH_INDEX_NAME accordingly
# (e.g. `aoai-md-index`).
index_name = os.environ["AZSCH_INDEX_NAME"]

In [None]:
# Define the index schema (key, text fields, one vector field), its vector and
# semantic search settings, then create or update the index on the service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# The four text fields share identical settings, so build them from their names.
text_field_names = ("title", "chunk", "parent_id", "chunk_id")
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    *(
        SearchableField(
            name=field_name,
            type=SearchFieldDataType.String,
            searchable=True,
            retrievable=True,
        )
        for field_name in text_field_names
    ),
    # NOTE(review): 1536 is presumably the embedding length of the model used
    # for ingestion (text-embedding-ada-002) -- confirm if the model changes.
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# HNSW algorithm configuration using the cosine metric.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Profile referenced by the "vector" field; ties the algorithm to the vectorizer.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer_name="vectorizer",
)

# Vectorizer backed by the Azure OpenAI resource configured earlier
# (uses the `azure_endpoint` and `api_key` values read from the environment).
openai_vectorizer = AzureOpenAIVectorizer(
    vectorizer_name="vectorizer",
    kind="azureOpenAI",
    parameters=AzureOpenAIVectorizerParameters(
        resource_url=azure_endpoint,
        api_key=api_key,
        deployment_name="text-embedding-ada-002",
        model_name="text-embedding-ada-002",
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration: "title" is the title field, "chunk" the content field.
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="chunk")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and send it to the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

## Ingest

In [None]:
import os
from openai import AzureOpenAI

# Azure OpenAI client shared by the embedding helpers below.
# Key and endpoint come from the environment; a missing variable raises KeyError.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version="2024-10-01-preview",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)

def generate_embeddings(text):
    """Return the embedding for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    with the model name "text-embedding-ada-002" and returns the ``embedding``
    attribute of the first item in the response.

    NOTE(review): with Azure OpenAI the ``model`` value is presumably the
    deployment name -- confirm it matches your resource.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

In [None]:
import pandas as pd
# Load the source documents. Later cells read the 'title', 'chunk',
# 'parent_id' and 'chunk_id' columns of this DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle('./aoai-docs.pkl')
# Bare expression: displays the DataFrame as the cell output in a notebook.
df

In [None]:
from tqdm import tqdm

# Parameters
# batch_size is passed to get_embeddings_in_batches below, where it sets how
# many 'chunk' texts are sent to the embeddings endpoint per request.
batch_size = 20  # Number of rows to process in a batch, adjust as needed

def get_embeddings_in_batches(df, batch_size, model_name="text-embedding-ada-002"):
    """Embed the 'chunk' column of ``df``, ``batch_size`` rows per request.

    Args:
        df: DataFrame with a 'chunk' column holding the texts to embed.
        batch_size: Number of rows sent to the embeddings endpoint per request.
        model_name: Value passed as ``model`` to the embeddings request.

    Returns:
        A list with one embedding per item returned by the API, appended batch
        by batch in the order the responses list them.
    """
    collected = []
    # tqdm wraps the range of batch start offsets to show a progress bar.
    for start in tqdm(range(0, len(df), batch_size)):
        # Positional slice; the last batch may be shorter than batch_size.
        texts = df['chunk'].iloc[start:start + batch_size].tolist()

        # One request embeds the whole batch.
        response = client.embeddings.create(input=texts, model=model_name)

        collected.extend(item.embedding for item in response.data)

    return collected

In [None]:
# Embed every row's 'chunk' text and store the result in a new 'vector' column.
df['vector'] = get_embeddings_in_batches(df, batch_size)
# Bare expression: displays the updated DataFrame as the cell output.
df

In [None]:
# Persist the DataFrame to a pickle file in the current working directory.
df.to_pickle('aoai-docs-vector.pkl')

In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as the dot product of the two vectors divided by the product of
    their norms.
    """
    dot_product = np.dot(A, B)
    magnitude = norm(A) * norm(B)
    return dot_product / magnitude

In [None]:
# Local check that does not use the search service: embed a sample question
# and rank the DataFrame rows by cosine similarity to it.
query = "what is Assistant API?"
vector = generate_embeddings(query)

# Adds a "cosine_sim" column holding each row's similarity to the query embedding.
df["cosine_sim"] = df['vector'].apply(lambda x: cosine_similarity(x, vector))
# Keep the three highest-scoring rows.
result = df.sort_values("cosine_sim", ascending=False).head(3)

# Bare expression: displays the top rows as the cell output.
result

In [None]:
# Client bound to the index named by index_name; the cells below use it to
# upload documents and to run searches.
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

In [None]:
# Upload the rows of df (with their precomputed 'vector' values) to the index
# in batches. upload_documents returns one result per document; these are
# inspected so that rejected documents are reported instead of being ignored.
count = 0  # documents the service reported as successfully indexed
failures = []  # (document key, error message) for every rejected document
batch_size = 20
for i in tqdm(range(0, len(df), batch_size)):
    # Positional slice; the final batch is simply shorter than batch_size.
    batch = df.iloc[i:i + batch_size]

    documents = batch.apply(
        lambda row: {
            'id': str(row.name),  # the DataFrame index label is the document key
            'title': row['title'],
            'chunk': row['chunk'],
            'parent_id': row['parent_id'],
            'chunk_id': str(row['chunk_id']),
            'vector': row['vector'],  # embedding computed earlier, not re-requested
        },
        axis=1,
    ).to_list()

    result = search_client.upload_documents(documents)
    for item in result:
        if item.succeeded:
            count += 1
        else:
            failures.append((item.key, item.error_message))

print(f"Uploaded {count} of {len(df)} documents")
for key, message in failures:
    print(f"Failed to upload document {key}: {message}")

In [None]:
from azure.search.documents.models import VectorizableTextQuery, VectorQuery, VectorizedQuery

In [None]:
from colorama import Fore, Back, Style

def azsch_embed_query(query):
    """Run a vector-only search for ``query`` and print each hit with its score.

    The query text is embedded with ``generate_embeddings`` and searched
    against the "vector" field through the module-level ``search_client``.
    One line is printed per hit: the score to 10 decimal places (red when
    below 0.8, green otherwise) followed by the title, parent_id and chunk_id.
    """
    # Alternative (not used here): VectorizableTextQuery(text=query, ...) passes the
    # raw text; presumably the service would then embed it with the index's vectorizer.
    query_embedding = generate_embeddings(query)
    vector_query = VectorizedQuery(
        vector=query_embedding,
        k_nearest_neighbors=3,
        fields="vector",
    )

    hits = search_client.search(
        search_text=None,  # no keyword search, vector query only
        vector_queries=[vector_query],
        select=["title", "chunk", "parent_id", "chunk_id"],
        top=10,  # for limiting text search
    )

    for hit in hits:
        score = hit['@search.score']
        if score < 0.8:
            colour = Fore.RED
        else:
            colour = Fore.GREEN
        location = f": {hit['title']} - {hit['parent_id']}, {hit['chunk_id']}"
        print(colour + f"{score:.10f}" + Style.RESET_ALL + location)

In [None]:
%%time
# Run the search for a sample question; the %%time magic on the cell's first
# line reports how long the cell takes.
azsch_embed_query("what is Assistant API?")