Package Installation

In [7]:
! pip install azure-search-documents==11.6.0b4 --quiet
! pip install azure-identity --quiet
! pip install openai --quiet

Set API Keys

In [8]:
from azure.core.credentials import AzureKeyCredential

import os

# Connection settings.
# Each secret/endpoint is read from an environment variable of the same name
# when it is set, so real keys never have to be saved in the notebook. When the
# variable is absent the original placeholder is used and must be edited by hand.
AZURE_SEARCH_SERVICE_URL: str = os.environ.get(
    "AZURE_SEARCH_SERVICE_URL", "<YOUR_AZURE_SEARCH_SERVICE_URL>")
AZURE_OPENAI_SERVICE_URL: str = os.environ.get(
    "AZURE_OPENAI_SERVICE_URL", "<YOUR_AZURE_OPENAI_SERVICE_URL>")

# Key credential shared by the search index client and the search client.
credential = AzureKeyCredential(
    os.environ.get("AZURE_SEARCH_SERVICE_API_KEY", "<YOUR_AZURE_SEARCH_SERVICE_API_KEY>"))

# When this is an empty string, the Azure OpenAI client cell passes a token
# provider instead of relying on the key.
AZURE_OPENAI_SERVICE_KEY: str = os.environ.get(
    "AZURE_OPENAI_SERVICE_KEY", "<YOUR_AZURE_OPENAI_SERVICE_KEY>")

# Name of the search index created, queried and deleted by this notebook.
INDEX_NAME: str = "vectest"

AZURE_OPENAI_EMBEDDING_DEPLOYMENT: str = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "<AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME>")

# Used both as the `dimensions` argument of every embeddings request and as the
# declared dimension of the index's vector fields, so the two always agree.
EMBEDDING_MODEL_DIMENSIONS: int = 1024
EMBEDDING_MODEL_NAME: str = "text-embedding-3-large"

Generate Embeddings

In [9]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
import json
import os

# Build an Azure Identity credential and a bearer-token callback scoped to
# Cognitive Services; the callback is only handed to the client when no API
# key has been configured.
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    openai_credential,
    "https://cognitiveservices.azure.com/.default",
)

# A non-empty key disables token-based auth; an empty key enables it.
if AZURE_OPENAI_SERVICE_KEY:
    ad_token_provider = None
else:
    ad_token_provider = token_provider

# Azure OpenAI client bound to the embedding deployment.
client = AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=AZURE_OPENAI_SERVICE_URL,
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    api_key=AZURE_OPENAI_SERVICE_KEY,
    azure_ad_token_provider=ad_token_provider,
)

# Load the source documents (a JSON array of objects with 'title' and 'content').
path = 'television-data.json'
with open(path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)

# Embed all titles and all contents, one request per field.
titles = [item['title'] for item in input_data]
content = [item['content'] for item in input_data]

title_response = client.embeddings.create(
    input=titles, model=EMBEDDING_MODEL_NAME, dimensions=EMBEDDING_MODEL_DIMENSIONS)
title_embeddings = [item.embedding for item in title_response.data]

content_response = client.embeddings.create(
    input=content, model=EMBEDDING_MODEL_NAME, dimensions=EMBEDDING_MODEL_DIMENSIONS)
content_embeddings = [item.embedding for item in content_response.data]

# Fail loudly instead of silently pairing documents with the wrong vectors.
if not (len(title_embeddings) == len(content_embeddings) == len(input_data)):
    raise ValueError(
        f"Expected {len(input_data)} embeddings per field, got "
        f"{len(title_embeddings)} title and {len(content_embeddings)} content embeddings")

# Attach the vectors to their documents. Pairing is positional, and the loop
# no longer rebinds `titles`/`content`, so those lists stay intact afterwards.
for item, title_vector, content_vector in zip(input_data, title_embeddings, content_embeddings):
    item['titleVector'] = title_vector
    item['contentVector'] = content_vector

# Persist the enriched documents; the upload cell reads this file back.
output_path = 'docVectors.json'

with open(output_path, "w", encoding='utf-8') as f:
    json.dump(input_data, f)

Set Up Fields

In [10]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters
)

# Client used to create and delete the index on the search service.
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_SERVICE_URL,
    credential=credential,
)

# Both vector fields share the same element type, dimension and profile,
# so they are generated from their names.
vector_fields = [
    SearchField(
        name=vector_field_name,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=EMBEDDING_MODEL_DIMENSIONS,
        vector_search_profile_name="myHnswProfile",
    )
    for vector_field_name in ("titleVector", "contentVector")
]

# Index schema: a string key, three text fields, then the two vector fields.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    *vector_fields,
]

Configure the Vector Search

In [11]:
# HNSW algorithm configuration, left at the SDK defaults.
hnsw_algorithm = HnswAlgorithmConfiguration(name="myHnsw")

# Azure OpenAI vectorizer pointing at the same deployment/model used to embed
# the documents.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myVectorizer",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=AZURE_OPENAI_SERVICE_URL,
        deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
        model_name=EMBEDDING_MODEL_NAME,
        api_key=AZURE_OPENAI_SERVICE_KEY,
    ),
)

# Profile referenced by the vector fields; it ties the algorithm and the
# vectorizer together by name.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myVectorizer",
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

Configure the Semantic Search

In [12]:
# Map index fields to the semantic roles: title, keywords and content.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="title"),
    keywords_fields=[SemanticField(field_name="category")],
    content_fields=[SemanticField(field_name="content")],
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Register the configuration and make it the default one.
semantic_search = SemanticSearch(
    default_configuration_name="my-semantic-config",
    configurations=[semantic_config],
)

Create the Search Index

In [13]:
# Assemble the index definition from the pieces built in the previous cells
# and create it on the service (or update it if it already exists).
index = SearchIndex(
    name=INDEX_NAME,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 vectest created


Upload to Service

In [14]:
from azure.search.documents import SearchClient
import json

# Read back the documents (with their vectors) written by the embedding cell.
output_path = 'docVectors.json'

with open(output_path, 'r', encoding='utf-8') as file:
    documents = json.load(file)

search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_URL, index_name=INDEX_NAME, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one result per document; report the number that
# actually succeeded rather than the number that was sent.
failed_keys = [doc_result.key for doc_result in result if not doc_result.succeeded]
print(f"Uploaded {len(result) - len(failed_keys)} documents")
if failed_keys:
    print(f"Failed to upload {len(failed_keys)} documents: {failed_keys}")

Uploaded 50 documents


Vector Search

In [15]:
from azure.search.documents.models import VectorizedQuery

# Embed the query text with the same model and dimensions as the documents.
query = "intergalactic"
query_response = client.embeddings.create(
    input=query,
    model=EMBEDDING_MODEL_NAME,
    dimensions=EMBEDDING_MODEL_DIMENSIONS,
)
embedding = query_response.data[0].embedding

# Pure vector query against the content vectors (no keyword search text).
vector_query = VectorizedQuery(
    vector=embedding,
    k_nearest_neighbors=3,
    fields="contentVector",
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
)

# One four-line record per hit, followed by a blank line.
for hit in results:
    print(
        f"Title: {hit['title']}",
        f"Score: {hit['@search.score']}",
        f"Content: {hit['content']}",
        f"Category: {hit['category']}\n",
        sep="\n",
    )

Title: The Expanse
Score: 0.5985822
Content: The Expanse is a space opera that offers a realistic and gripping portrayal of humanity's colonization of the solar system in the 23rd century. The series follows a disparate group of characters, including a detective, a ship's officer, and a United Nations executive, as they uncover a vast conspiracy that threatens the fragile peace between Earth, Mars, and the asteroid belt. The Expanse is known for its complex characters, intricate political intrigue, and its attention to scientific detail. The show has been praised for its world-building, its exploration of themes such as power, survival, and the nature of humanity, and its ability to deliver intense and thought-provoking storytelling.
Category: Science Fiction

Title: Star Trek: The Next Generation
Score: 0.5757152
Content: Star Trek: The Next Generation is a sci-fi series that follows the adventures of a new generation of Starfleet officers on the U.S.S. Enterprise-D. The series is set

Cleanup

In [16]:
# Delete the index defined by the `index` object built in the "Create the
# Search Index" cell, so the demo leaves nothing behind on the service.
index_client.delete_index(index)