# Prepare Azure AI Search with Vector Search

This notebook demonstrates how to use the Azure AI Search SDK to create a vector-enabled search index over the product data in the AdventureWorksLT sample database on Azure SQL. The resulting index is used by the Azure SQL promptflow demo.


### Prerequisites

To run the code, you need to install the packages in the requirements.txt file. You can do this by running the following command:

```bash
pip install -r requirements.txt
```

Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

In [1]:
import pandas as pd
import pyodbc
import json
from openai import AzureOpenAI
from tqdm.auto import tqdm
from dotenv import load_dotenv
import os
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
import requests

  from .autonotebook import tqdm as notebook_tqdm


#### Locate your .env file (it should be at the base of the repo)

In [2]:
# Read settings from the .env file four directories above this notebook.
# override=True is passed so that (per python-dotenv) values from the file
# take precedence over variables already present in the environment.
load_dotenv(dotenv_path='../../../../.env', override=True)

True

In [3]:
# --- Azure OpenAI settings ---
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_API_BASE")
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_openai_version = os.environ.get("AZURE_OPENAI_API_VERSION")
azure_openai_gpt_deployment = os.environ.get("AZURE_OPENAI_API_GPT_DEPLOYMENT")
azure_openai_emb_deployment = os.environ.get("AZURE_OPENAI_API_EMB_DEPLOYMENT")

# --- Azure AI Search settings ---
azure_search_key = os.environ.get("AZURE_SEARCH_KEY")
azure_search_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT")
azure_search_index = os.environ.get("AZURE_SEARCH_INDEX")

# --- Azure SQL settings ---
azure_sql_server = os.environ.get("AZURE_SQL_SERVER")
azure_sql_database = os.environ.get("AZURE_SQL_DATABASE_NAME")
azure_sql_user = os.environ.get("AZURE_SQL_USER")
azure_sql_pass = os.environ.get("AZURE_SQL_PASSWORD")

# ODBC connection string assembled from the SQL settings above.
# Any variable missing from the environment is None and is rendered as the text "None".
connectionString = (
    'Driver={ODBC Driver 18 for SQL Server};'
    f'Server=tcp:{azure_sql_server}.database.windows.net,1433;'
    f'Database={azure_sql_database};'
    f'Uid={azure_sql_user};'
    f'Pwd={azure_sql_pass};'
    'Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
)

In [4]:
# Azure OpenAI client used to create embeddings
client = AzureOpenAI(
    api_key=azure_openai_key,
    api_version=azure_openai_version,
    azure_endpoint=azure_openai_endpoint,
)


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text,  embedding_model_deploy_id=azure_openai_emb_deployment):
    """Return the embedding vector for ``text``.

    Used for the indexed document fields and also for query embeddings.
    The call is retried with a randomized exponential wait (min=1, max=20)
    and gives up after 6 attempts.

    Args:
        text: The input passed to the embeddings endpoint.
        embedding_model_deploy_id: Name of the embedding deployment; defaults
            to the value of ``azure_openai_emb_deployment`` at definition time.

    Returns:
        The ``embedding`` of the first item in the response data.
    """
    return client.embeddings.create(
        model=embedding_model_deploy_id,
        input=text,
    ).data[0].embedding

In [6]:
# SQL query returning each product joined with its category name and its
# English ('en' culture) product description
sqlQuery="""SELECT PC.Name AS ProductCategoryName, SP.ProductId, SP.Name, SP.ProductNumber, SP.Color, SP.ListPrice, SP.Size, SP.ProductCategoryID, SP.ProductModelID, PD.ProductDescriptionID, PD.Description
from [SalesLT].[Product] SP
INNER JOIN SalesLT.ProductCategory PC ON PC.ProductCategoryID = SP.ProductCategoryID
INNER JOIN [SalesLT].[ProductModelProductDescription] PMPD ON PMPD.ProductModelID = SP.ProductModelID
INNER JOIN [SalesLT].[ProductDescription] PD ON PD.ProductDescriptionID = PMPD.ProductDescriptionID
WHERE PMPD.Culture = 'en'"""

# Connect to the database and execute the query to get the data for indexing.
# The cursor and the connection are both closed even if the query fails, and
# database errors are left to propagate: continuing with an empty result would
# only fail later with an unrelated KeyError on the 'Description' column.
conn = pyodbc.connect(connectionString)
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sqlQuery)
        records = cursor.fetchall()
        # cursor.description holds one entry per result column; item 0 is the column name
        columns = [col[0] for col in cursor.description]
    finally:
        cursor.close()
finally:
    conn.close()

queryResults = pd.DataFrame.from_records(records, columns=columns)

# Round-trip through JSON to get a list of plain dicts (one per row) for indexing
queryResultsJson = json.loads(queryResults.to_json(orient='records'))
print(f"Total records to be indexed: {len(queryResultsJson)}, the maximum length of the description field is {queryResults['Description'].str.len().max()} characters.")

Total records to be indexed: 294, the maximum length of the description field is 221 characters.


In [7]:
# Generate embeddings for the product description and product category name fields.
# Many products share the same category, so each distinct category name is
# embedded only once and the resulting vector is reused for every product in
# that category instead of calling the embeddings endpoint again.
print("Generating embeddings for the product name and product description fields.")
category_vectors = {}
for doc in tqdm(queryResultsJson):
    doc['DescriptionVector'] = generate_embeddings(doc['Description'].strip())
    category_name = doc['ProductCategoryName']
    if category_name not in category_vectors:
        category_vectors[category_name] = generate_embeddings(category_name)
    doc['ProductCategoryNameVector'] = category_vectors[category_name]

Generating embeddings for the product name and product description fields.


  0%|          | 0/294 [00:00<?, ?it/s]

100%|██████████| 294/294 [00:55<00:00,  5.27it/s]


### Create your search index

In [8]:
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,    
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)


In [9]:
# Connection settings for the Azure AI Search service, taken from the environment values read earlier
key = azure_search_key
credential = AzureKeyCredential(key)
service_endpoint = azure_search_endpoint
index_name = azure_search_index  # e.g. "promptflow-demo-product-description"

In [10]:
# Client used to create / update the search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)

# Index schema.
# - Text fields that take part in full-text search use SearchField(searchable=True).
# - SimpleField always produces a non-searchable field (it discards any
#   `searchable` argument), so it is used for the price, size and numeric ID
#   fields that are only filtered on.
# - Retrievability is controlled through `hidden`; `retrievable` is not a
#   SearchField parameter, so hidden=False is used to state the intent.
# - ProductId is the document key and therefore a string.
fields = [
    SearchField(name="ProductCategoryName", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=False, hidden=False),
    SearchField(name="ProductId", type=SearchFieldDataType.String, searchable=True, filterable=True, key=True),
    SearchField(name="Name", type=SearchFieldDataType.String, key=False, searchable=True, sortable=False, filterable=True, facetable=False, hidden=False),
    SearchField(name="ProductNumber", type=SearchFieldDataType.String, searchable=True),
    SearchField(name="Color", type=SearchFieldDataType.String, searchable=True),
    SimpleField(name="ListPrice", type=SearchFieldDataType.Double, filterable=True),
    SimpleField(name="Size", type=SearchFieldDataType.String, filterable=True),
    SimpleField(name="ProductCategoryID", type=SearchFieldDataType.Int32, filterable=True),
    SimpleField(name="ProductModelID", type=SearchFieldDataType.Int32, filterable=True),
    SimpleField(name="ProductDescriptionID", type=SearchFieldDataType.Int32, filterable=True),
    SearchField(name="Description", type=SearchFieldDataType.String,
                filterable=True, searchable=True),
    # Vector fields: 1536-dimensional embeddings bound to the "myHnswProfile" vector search profile
    SearchField(name="DescriptionVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="ProductCategoryNameVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
]

# Vector search configuration, assembled from its three parts:
# algorithms, the profiles that reference them, and the query-time vectorizer.

# Approximate (HNSW) and exhaustive KNN algorithms, both using cosine similarity
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# One profile per algorithm; both reference the "myOpenAI" vectorizer defined below
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
    vectorizer="myOpenAI",
)

# Azure OpenAI vectorizer pointing at the same endpoint and embedding deployment
# that were used to embed the documents
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_emb_deployment,
        api_key=azure_openai_key,
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration: Name is the title field, Description the content
# field and ProductCategoryName the keywords field.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="Name"),
    content_fields=[SemanticField(field_name="Description")],
    keywords_fields=[SemanticField(field_name="ProductCategoryName")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Semantic settings wrapping the single configuration above
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition (fields + vector search + semantic settings)
# and create it, or update it if an index with this name already exists.
index = SearchIndex(
    name=index_name,
    fields=fields,
    semantic_search=semantic_search,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

promptflow-demo-product-description created


### Index and upload embedded documents into vector store

In [11]:
# ProductId is the key field of the index, so it needs to be a string
for doc in queryResultsJson:
    doc['ProductId'] = str(doc['ProductId'])

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(queryResultsJson)

# upload_documents returns one IndexingResult per document. Report the number
# that actually succeeded, and list any document the service rejected, instead
# of assuming the whole batch was indexed.
failed = [item for item in result if not item.succeeded]
print(f"Uploaded {len(result) - len(failed)} documents")
for item in failed:
    print(f"Failed to index document {item.key}: {item.error_message}")

Uploaded 294 documents


### Vector Search Example

In [13]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

### Use SDK Search Client (use AI Search built-in embedding generation)

In [15]:
import pandas as pd
# Hybrid search: the same query text is used for keyword search (search_text)
# and for vector search over both embedding fields. The query text is
# vectorized by the search service (VectorizableTextQuery), not in this notebook.
query = "Do you have something that can help me exercise at home?"
top_k = 5

search_client = SearchClient(service_endpoint, index_name, credential=credential)

# k_nearest_neighbors should be set to 50 in order to boost the relevance of hybrid search.
# Increasing the vector recall set size from 1 to 50 in hybrid search benefits relevance by
# improving the diversity of vector query results that will be considered by RRF, ensuring a
# more comprehensive representation of the data results and more robustness to varying
# similarity scores or closely related similarity scores.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=50,
    fields="ProductCategoryNameVector, DescriptionVector",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    # One list entry per field, rather than a single comma-joined string
    select=[
        "ProductId",
        "ProductCategoryName",
        "Name",
        "ProductNumber",
        "Color",
        "ListPrice",
        "Size",
        "ProductCategoryID",
        "ProductModelID",
        "ProductDescriptionID",
        "Description",
    ],
    top=top_k,
)

data = [[hit["ProductId"], hit["Name"], hit["Description"], hit["@search.score"]] for hit in results]
print(pd.DataFrame(data, columns=["id", "title", "content", "@search.score"]))


    id                   title  \
0  875         Racing Socks, L   
1  874         Racing Socks, M   
2  935       LL Mountain Pedal   
3  879  All-Purpose Bike Stand   
4  754        Road-450 Red, 58   

                                             content  @search.score  
0  Thin, lightweight and durable with cuffs that ...       0.046183  
1  Thin, lightweight and durable with cuffs that ...       0.045365  
2  Expanded platform so you can ride in any shoes...       0.038821  
3  Perfect all-purpose bike stand for working on ...       0.037868  
4  A true multi-sport bike that offers streamline...       0.034137  


### Or REST API (manually generate embeddings of query)

In [18]:
# Vector search through the REST API: the query embedding is generated here
# with generate_embeddings and sent to the service as a raw vector.
query = "Do you have something that can help me exercise at home?"
top_k = 5
api_version = "2023-11-01"

headers = {
        'Content-Type': 'application/json',
        'api-key': key,
    }
params = {
    'api-version': api_version,
}
body = {
    "vectorQueries": [
        {
            "kind": "vector",
            "vector": generate_embeddings(query),
            "fields": "ProductCategoryNameVector, DescriptionVector",
            "k": top_k

        },
    ],
    "select": "ProductId, ProductCategoryName, Name, ProductNumber, Color, ListPrice, Size, ProductCategoryID, ProductModelID, ProductDescriptionID, Description",
    "top": top_k,
}
# A timeout keeps the cell from hanging indefinitely if the service does not respond
response = requests.post(
    f"{service_endpoint}/indexes/{index_name}/docs/search", headers=headers, params=params, json=body,
    timeout=30)
# Fail here with the HTTP error rather than later with a KeyError when reading 'value'
response.raise_for_status()

In [19]:
# Display the entries stored under the 'value' key of the JSON search response
response.json()['value']

[{'@search.score': 0.01666666753590107,
  'ProductCategoryName': 'Socks',
  'ProductId': '710',
  'Name': 'Mountain Bike Socks, L',
  'ProductNumber': 'SO-B909-L',
  'Color': 'White',
  'ListPrice': 9.5,
  'Size': 'L',
  'ProductCategoryID': 27,
  'ProductModelID': 18,
  'ProductDescriptionID': 1189,
  'Description': 'Combination of natural and synthetic fibers stays dry and provides just the right cushioning.'},
 {'@search.score': 0.01666666753590107,
  'ProductCategoryName': 'Bike Stands',
  'ProductId': '879',
  'Name': 'All-Purpose Bike Stand',
  'ProductNumber': 'ST-1401',
  'Color': None,
  'ListPrice': 159.0,
  'Size': None,
  'ProductCategoryID': 31,
  'ProductModelID': 122,
  'ProductDescriptionID': 1201,
  'Description': 'Perfect all-purpose bike stand for working on your bike at home. Quick-adjusting clamps and steel construction.'},
 {'@search.score': 0.016393441706895828,
  'ProductCategoryName': 'Socks',
  'ProductId': '709',
  'Name': 'Mountain Bike Socks, M',
  'Product