### Setup

In [None]:
%pip install openai[datalib]
%pip install azure-identity
%pip install azure-search-documents==11.4.0b10

In [None]:

from azure.identity import DefaultAzureCredential
import openai
from openai.embeddings_utils import cosine_similarity, get_embedding
from azure.search.documents import SearchClient
from azure.search.documents.models import RawVectorQuery

# --- Service configuration ---
# Fill in the placeholders with your own resources. The signed-in identity must be granted
# access to both services (or switch to key-based auth instead).
AZURE_OPENAI_URL = "<<your openai service url>>"  # Azure OpenAI instance
AZURE_SEARCH_SERVICE = "<<your search service url>>"  # Azure Search service
AZURE_SEARCH_TINY_INDEX = "tinyvector"  # small index with tiny vectors, created later in this notebook
AZURE_SEARCH_BIGGER_INDEX = "<<your search index>>"  # index with richer content; the prep scripts in https://github.com/azure-samples/azure-search-openai-demo/ can build it, no need to deploy the whole sample

# One credential object, reused for the OpenAI token below and for the Search clients later on.
creds = DefaultAzureCredential()

# Point the openai module at the Azure OpenAI resource using Azure AD auth.
openai.api_type = "azure_ad"
openai.api_version = "2022-12-01"
openai.api_base = AZURE_OPENAI_URL
# The bearer token is fetched once here and stored as the api key; this cell does not refresh it.
openai.api_key = creds.get_token("https://cognitiveservices.azure.com/.default").token

### Vector representations

In [None]:
# Embed a short string. `engine="embedding"` is presumably the name of the embedding
# deployment in your Azure OpenAI resource — confirm it matches your deployment.
v = get_embedding("hello, world", engine="embedding")

In [None]:
# Show how many components the embedding computed in the previous cell has.
len(v)

### Document similarity modeled as cosine similarity

In [None]:
# The same reference sentence is compared against three different candidates.
sentences1 = ['The new movie is awesome'] * 3

sentences2 = [
    'The dog plays in the garden',
    'This recent movie is so good',
    'The new movie is awesome',
]

# One embedding per sentence, in the same order as the sentence lists.
embeddings1 = [get_embedding(text, engine="embedding") for text in sentences1]
embeddings2 = [get_embedding(text, engine="embedding") for text in sentences2]

# Print each pair side by side with the cosine similarity of their embeddings.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for left, right, emb_left, emb_right in zip(sentences1, sentences2, embeddings1, embeddings2):
    score = cosine_similarity(emb_left, emb_right)
    print(row_template.format(left, right, score))

### Create a tiny vector index

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *

# Schema for the tiny demo index: a string key plus a single 3-dimensional vector field.
index = SearchIndex(
    name=AZURE_SEARCH_TINY_INDEX,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="myVector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=3,  # the documents and query in this notebook use 3-element vectors
            vector_search_profile="vprofile",  # name of the profile declared under vector_search below
        ),
    ],
    vector_search=VectorSearch(
        # HNSW algorithm configuration using the cosine metric, referenced by name from the profile.
        algorithms=[HnswVectorSearchAlgorithmConfiguration(name="algo", parameters=HnswParameters(metric="cosine"))],
        profiles=[VectorSearchProfile(name="vprofile", algorithm="algo")],
    ),
)

index_client = SearchIndexClient(AZURE_SEARCH_SERVICE, credential=creds)
# create_or_update_index (rather than create_index) keeps this cell re-runnable:
# create_index raises once the index already exists, which breaks Restart & Run All.
index_client.create_or_update_index(index)

### Insert a few documents with tiny vectors

In [None]:
# Three toy documents, each with a string id and a 3-element vector for the "myVector" field.
tiny_documents = [
    {"id": doc_id, "myVector": vector}
    for doc_id, vector in (("1", [1, 2, 3]), ("2", [1, 1, 3]), ("3", [4, 5, 6]))
]

# Client bound to the tiny index; the upload result is the cell's displayed output.
search_client = SearchClient(AZURE_SEARCH_SERVICE, AZURE_SEARCH_TINY_INDEX, credential=creds)
search_client.upload_documents(documents=tiny_documents)

### Search using vector similarity

In [None]:
# Vector-only query (search text is None) against "myVector"; k=3 matches the number of
# documents uploaded in the previous cell.
tiny_query = RawVectorQuery(vector=[2, 2, 3], k=3, fields="myVector")
r = search_client.search(None, vector_queries=[tiny_query])
for hit in r:
    print(f"id: {hit['id']}, score: {hit['@search.score']}")

### Searching on a real index

In [None]:
# Rebind search_client to the index named by AZURE_SEARCH_BIGGER_INDEX; the cells below query it.
search_client = SearchClient(AZURE_SEARCH_SERVICE, AZURE_SEARCH_BIGGER_INDEX, credential=creds)

#### Pure vector search

In [None]:
search_query = "learning about underwater activities"
search_vector = get_embedding(search_query, engine="embedding")

# No search text is passed (None), so only the vector query drives retrieval.
vector_query = RawVectorQuery(vector=search_vector, k=50, fields="embedding")
r = search_client.search(None, top=5, vector_queries=[vector_query])

# Print the score and the first 150 characters of each hit, with newlines flattened to spaces.
for hit in r:
    snippet = hit["content"].replace("\n", " ")[:150]
    print(f"score: {hit['@search.score']}. {snippet}")

#### Keyword search and exact matches

In [None]:
search_query = "compliance@contoso.com"
search_vector = get_embedding(search_query, engine="embedding")

# Vector-only query for an exact string (an e-mail address).
vector_query = RawVectorQuery(vector=search_vector, k=50, fields="embedding")
r = search_client.search(None, top=5, vector_queries=[vector_query])

for hit in r:
    # Skip hits whose content does not literally contain the address.
    if "compliance@contoso.com" not in hit["content"]:
        continue
    snippet = hit["content"].replace("\n", " ")[:150]
    print(f"score: {hit['@search.score']}. {snippet}")

In [None]:
search_query = "compliance@contoso.com"

# Text-only query: the query string is passed as search text and no vector query is attached.
r = search_client.search(search_query, top=5)

# Lazily keep only the hits whose content literally contains the address.
matching_hits = (hit for hit in r if "compliance@contoso.com" in hit["content"])
for hit in matching_hits:
    snippet = hit["content"].replace("\n", " ")[:150]
    print(f"score: {hit['@search.score']}. {snippet}")

#### Hybrid retrieval

In [None]:
# Alternative query to try: "learning about underwater activities"
search_query = "compliance@contoso.com"
search_vector = get_embedding(search_query, engine="embedding")

# Hybrid request: the same query is sent both as search text and as a vector query.
vector_query = RawVectorQuery(vector=search_vector, k=50, fields="embedding")
r = search_client.search(search_query, top=5, vector_queries=[vector_query])

for hit in r:
    # Case-insensitive check: only show hits that mention "compliance" somewhere in the content.
    if "compliance" not in hit["content"].lower():
        continue
    snippet = hit["content"].replace("\n", " ")[:150]
    print(f"score: {hit['@search.score']}. {snippet}")

#### Hybrid + Semantic Reranking

In [None]:
search_query = "learning about underwater activities"
search_vector = get_embedding(search_query, engine="embedding")

# Hybrid request (search text + vector query) with query_type="semantic", using the
# semantic configuration named "default" on the index.
vector_query = RawVectorQuery(vector=search_vector, k=50, fields="embedding")
r = search_client.search(
    search_query,
    top=5,
    vector_queries=[vector_query],
    query_type="semantic",
    semantic_configuration_name="default",
    query_language="en-us",
)

# Show the search score and the reranker score for each hit, plus a 150-character snippet.
for hit in r:
    snippet = hit["content"].replace("\n", " ")[:150]
    print(f"score: {hit['@search.score']}, reranker: {hit['@search.reranker_score']}. {snippet}")