Package Installation

In [1]:
! pip install azure-search-documents==11.6.0b4 --quiet
! pip install azure-identity --quiet
! pip install openai --quiet

Set API Keys

In [2]:
from azure.core.credentials import AzureKeyCredential

# TODO: Configure your app
# Replace every "<...>" placeholder below with the value for your own resources.

# --- Search service ---
AZURE_SEARCH_SERVICE_URL: str = "<YOUR_AZURE_SEARCH_SERVICE_URL>"
INDEX_NAME: str = "vectest"
# Key credential passed to the search clients created in later cells.
credential = AzureKeyCredential("<YOUR_AZURE_SEARCH_SERVICE_API_KEY>")

# --- Azure OpenAI service ---
AZURE_OPENAI_SERVICE_URL: str = "<YOUR_AZURE_OPENAI_SERVICE_URL>"
AZURE_OPENAI_SERVICE_KEY: str = "<YOUR_AZURE_OPENAI_SERVICE_KEY>"
AZURE_OPENAI_EMBEDDING_DEPLOYMENT: str = "<AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME>"

# --- Embedding model ---
EMBEDDING_MODEL_NAME: str = "text-embedding-3-large"
# Vector length requested when creating embeddings and declared on the index's vector fields.
EMBEDDING_MODEL_DIMENSIONS: int = 1024

Generate Embeddings

In [3]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
import json
import os

# Build the Azure OpenAI client used to create embeddings.
# The API key is always passed as-is; the Azure AD token provider is passed only when
# AZURE_OPENAI_SERVICE_KEY is empty, otherwise None is passed in its place.
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    openai_credential,
    "https://cognitiveservices.azure.com/.default",
)

if AZURE_OPENAI_SERVICE_KEY:
    ad_token_provider = None
else:
    ad_token_provider = token_provider

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_SERVICE_URL,
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    api_version="2024-06-01",
    api_key=AZURE_OPENAI_SERVICE_KEY,
    azure_ad_token_provider=ad_token_provider,
)

# Load the sample documents. Each item is read for its 'title' and 'content' keys below.
path = 'text-sample-2.json'
with open(path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)

# Embed all titles and all contents, one request per field.
titles = [item['title'] for item in input_data]
content = [item['content'] for item in input_data]
title_response = client.embeddings.create(input=titles, model=EMBEDDING_MODEL_NAME, dimensions=EMBEDDING_MODEL_DIMENSIONS)
title_embeddings = [item.embedding for item in title_response.data]
content_response = client.embeddings.create(input=content, model=EMBEDDING_MODEL_NAME, dimensions=EMBEDDING_MODEL_DIMENSIONS)
content_embeddings = [item.embedding for item in content_response.data]

# Embeddings are matched to documents by position, so a count mismatch would attach
# vectors to the wrong documents (or fail later with an opaque IndexError).
# NOTE(review): assumes response.data preserves the order of the inputs — confirm.
if not (len(title_embeddings) == len(content_embeddings) == len(input_data)):
    raise ValueError(
        f"Expected {len(input_data)} embeddings per field, got "
        f"{len(title_embeddings)} title and {len(content_embeddings)} content embeddings"
    )

# Attach both vectors to each document. The `titles` / `content` lists are left intact
# (the loop no longer rebinds `content` to a single document's text).
for item, title_vector, content_vector in zip(input_data, title_embeddings, content_embeddings):
    item['titleVector'] = title_vector
    item['contentVector'] = content_vector

# Persist the enriched documents for the upload cell; UTF-8 matches how the input was read.
output_path = 'docVectors.json'

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(input_data, f)

Setup Fields

In [4]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters
)

# Client used to create, update and delete index definitions on the search service.
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_SERVICE_URL, credential=credential)

# The two embedding fields differ only by name: both are searchable float collections
# sized to EMBEDDING_MODEL_DIMENSIONS and bound to the "myHnswProfile" vector profile.
vector_fields = [
    SearchField(
        name=vector_field_name,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=EMBEDDING_MODEL_DIMENSIONS,
        vector_search_profile_name="myHnswProfile",
    )
    for vector_field_name in ("titleVector", "contentVector")
]

# Index schema: a string key, three text fields, then the vector fields.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    *vector_fields,
]


Configure the Vector Search


In [5]:
# TODO: Configure vector search
# Uncomment the block below to define `vector_search`. It declares:
#   - "myHnsw":        an HNSW algorithm configuration (only a name is supplied).
#   - "myHnswProfile": the profile name that the titleVector/contentVector fields refer to;
#                      it links the "myHnsw" algorithm with the "myVectorizer" vectorizer.
#   - "myVectorizer":  an Azure OpenAI vectorizer built from the service URL, embedding
#                      deployment, model name and key set in the configuration cell.
# The "Create the Search Index" cell passes `vector_search` to SearchIndex, so this block
# must be uncommented and run before that cell.
# vector_search = VectorSearch(
#     algorithms=[
#         HnswAlgorithmConfiguration(
#             name="myHnsw"
#         )
#     ],
#     profiles=[
#         VectorSearchProfile(
#             name="myHnswProfile",
#             algorithm_configuration_name="myHnsw",
#             vectorizer="myVectorizer"
#         )
#     ],
#     vectorizers=[
#         AzureOpenAIVectorizer(
#             name="myVectorizer",
#             azure_open_ai_parameters=AzureOpenAIParameters(
#                 resource_uri=AZURE_OPENAI_SERVICE_URL,
#                 deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
#                 model_name=EMBEDDING_MODEL_NAME,
#                 api_key=AZURE_OPENAI_SERVICE_KEY
#             )
#         )
#     ]
# )

Configure the Semantic Search

In [6]:
# TODO: Configure semantic search
# Uncomment the block below to define `semantic_config`. It names the configuration
# "my-semantic-config" and prioritizes the "title" field as the title, "category" as
# keywords and "content" as content.
# semantic_config = SemanticConfiguration(
#     name="my-semantic-config",
#     prioritized_fields=SemanticPrioritizedFields(
#         title_field=SemanticField(field_name="title"),
#         keywords_fields=[SemanticField(field_name="category")],
#         content_fields=[SemanticField(field_name="content")]
#     )
# )

# Wraps the configuration for use when the index is created.
# NOTE: `semantic_config` is only defined by the commented-out block above; until that
# block is uncommented this line raises NameError.
semantic_search = SemanticSearch(configurations=[semantic_config])

Create the Search Index

In [None]:
# TODO: Create a search index
# Uncomment the block below to build the index definition from `fields`, `vector_search`
# and `semantic_search` (all defined in earlier cells) and create or update it on the
# service under INDEX_NAME. The resulting `index` variable is also what the Cleanup cell
# refers to.
# index = SearchIndex(name=INDEX_NAME, fields=fields,
#                     vector_search=vector_search, semantic_search=semantic_search)
# result = index_client.create_or_update_index(index)
# print(f' {result.name} created')

Upload to Service

In [None]:
from azure.search.documents import SearchClient
import json

# Read back the documents (with their embedding vectors) written by the embedding cell.
output_path = os.path.join('docVectors.json')

with open(output_path, 'r', encoding='utf-8') as file:
    documents = json.load(file)

# Client bound to the target index; reused by the query cells below.
search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_URL, index_name=INDEX_NAME, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document. Report the number the
# service actually accepted instead of assuming every document succeeded, and list
# any rejected documents with the service's error message.
failed_uploads = [indexing_result for indexing_result in result if not indexing_result.succeeded]
print(f"Uploaded {len(result) - len(failed_uploads)} documents")
for failure in failed_uploads:
    print(f"Failed to upload document '{failure.key}': {failure.error_message}")

Hybrid Search (Keyword + Vector)

In [None]:
from azure.search.documents.models import VectorizableTextQuery

query = "funny entertainment"

# Text-based vector query against the contentVector field, asking for the 3 nearest neighbours.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="contentVector",
)

# TODO: Perform a hybrid search
# Uncomment the block below to define `results`; the loop that follows reads from it.
# results = search_client.search(
#     search_text=query,
#     vector_queries=[vector_query],
#     select=["title", "content", "category"],
#     top=3
# )

# Print each hit's title, score, content and category, followed by a blank line.
for hit in results:
    print(
        f"Title: {hit['title']}\n"
        f"Score: {hit['@search.score']}\n"
        f"Content: {hit['content']}\n"
        f"Category: {hit['category']}\n"
    )

Weighted Vector Search

In [None]:
query = "funny entertainment"

# Same vector query as the previous cell, but with weight=0.2.
# NOTE(review): the weight presumably scales this vector query's contribution to the
# combined ranking — confirm against the SDK documentation.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=3,
    fields="contentVector",
    weight=0.2,
)

# TODO: Perform a weighted vector search
# Uncomment the block below to define `results`; the loop that follows reads from it.
# results = search_client.search(
#     search_text=query,
#     vector_queries=[vector_query],
#     select=["title", "content", "category"],
#     top=3
# )

# Print each hit's title, score, content and category, followed by a blank line.
for hit in results:
    print(
        f"Title: {hit['title']}\n"
        f"Score: {hit['@search.score']}\n"
        f"Content: {hit['content']}\n"
        f"Category: {hit['category']}\n"
    )

Semantic Hybrid Search

In [None]:
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType

query = "what is friends?"

# NOTE(review): exhaustive=True presumably requests an exhaustive (non-approximate)
# nearest-neighbour scan — confirm against the SDK documentation.
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=3, fields="contentVector", exhaustive=True)

# TODO: Perform a semantic hybrid search
# Uncomment the block below to define `results`; the code that follows reads from it.
# results = search_client.search(  
#     search_text=query,  
#     vector_queries=[vector_query],
#     select=["title", "content", "category"],
#     query_type=QueryType.SEMANTIC, semantic_configuration_name='my-semantic-config', query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,
#     top=3
# )

# get_answers() is typed as Optional: treat a missing answer list as "no answers"
# instead of failing with TypeError when iterating over None.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when one is present.
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")
    print(f"Category: {result['category']}")

    # Only the first caption is shown; highlighted text is preferred over plain text.
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

Cleanup

In [12]:
# Delete the index by name rather than via the `index` object, which only exists if the
# "Create the Search Index" cell was run in this kernel session. delete_index accepts
# either the index name or a SearchIndex instance.
index_client.delete_index(INDEX_NAME)