In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

True

## Basic call to Azure OpenAI API

In [2]:
from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI

# Create an AzureOpenAI client instance
azure_openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("api_version"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Call the Azure OpenAI API
response = azure_openai_client.chat.completions.create(
    model=os.getenv("deployment"),
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant that helps people find information.",
        },
        {"role": "user", "content": "Who were the founders of Microsoft?"},
    ],
)
print(response.choices[0].message.content)

Microsoft was founded by Bill Gates and Paul Allen. They established the company on April 4, 1975, in Albuquerque, New Mexico.


In [3]:
import json
import wget
import pandas as pd
import zipfile
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryType,
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    ComplexField,
    CorsOptions,
    ScoringProfile,
    SearchResourceEncryptionKey,
)
from typing import List

from dotenv import load_dotenv
import os
from azure.identity import AzureCliCredential
from azure.search.documents import SearchClient
load_dotenv(override=True)
# Azure Search configuration
SEARCH_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")
credential = AzureCliCredential()
# Create Search client instance
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=credential
)

  from pandas.core import (


In [4]:
embeddings_url = "https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip"

# The file is ~700 MB so this will take some time
#wget.download(embeddings_url)

In [5]:
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip", "r") as zip_ref:
    zip_ref.extractall("./data")

In [48]:
article_df = pd.read_csv("./data/vector_database_wikipedia_articles_embedded.csv")

# Read vectors from strings back into a list using json.loads
article_df["title_vector"] = article_df.title_vector.apply(json.loads)
article_df["content_vector"] = article_df.content_vector.apply(json.loads)
article_df["vector_id"] = article_df["vector_id"].apply(str)
article_df.head()

Unnamed: 0,id,url,title,text,title_vector,content_vector,vector_id
0,1,https://simple.wikipedia.org/wiki/April,April,April is the fourth month of the year in the J...,"[0.001009464613161981, -0.020700545981526375, ...","[-0.011253940872848034, -0.013491976074874401,...",0
1,2,https://simple.wikipedia.org/wiki/August,August,August (Aug.) is the eighth month of the year ...,"[0.0009286514250561595, 0.000820168002974242, ...","[0.0003609954728744924, 0.007262262050062418, ...",1
2,6,https://simple.wikipedia.org/wiki/Art,Art,Art is a creative activity that expresses imag...,"[0.003393713850528002, 0.0061537534929811954, ...","[-0.004959689453244209, 0.015772193670272827, ...",2
3,8,https://simple.wikipedia.org/wiki/A,A,A or a is the first letter of the English alph...,"[0.0153952119871974, -0.013759135268628597, 0....","[0.024894846603274345, -0.022186409682035446, ...",3
4,9,https://simple.wikipedia.org/wiki/Air,Air,Air refers to the Earth's atmosphere. Air is a...,"[0.02224554680287838, -0.02044147066771984, -0...","[0.021524671465158463, 0.018522677943110466, -...",4


## Create an encryted index with Azure AI Search Python SDK

In [None]:
credential = AzureCliCredential() # Authenticates by requesting a token from the Azure CLI. This requires being logged in via "az login", and will use the CLI's currently logged in identity.
# Get the service endpoint
endpoint = os.getenv('AZURE_SEARCH_SERVICE_ENDPOINT')
index_client = SearchIndexClient(endpoint, credential=credential)


# Define the fields for the index
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String),
    SimpleField(name="vector_id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="url", type=SearchFieldDataType.String),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="text", type=SearchFieldDataType.String),
    SearchField(
        name="title_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",
    ),
]

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="my-hnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="my-hnsw",
        )
    ],
)

# Configure the semantic search configuration
semantic_search = SemanticSearch(
    configurations=[
        SemanticConfiguration(
            name="my-semantic-config",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="title"),
                keywords_fields=[SemanticField(field_name="url")],
                content_fields=[SemanticField(field_name="text")],
            ),
        )
    ]
)

customer_managed_key = SearchResourceEncryptionKey(
    key_name=os.getenv("KEYVAULT_KEY_NAME"),
    key_version="",
    vault_uri=os.getenv("KEYVAULT_URI"),
)

# Create the search index with the vector search and semantic search configurations
index = SearchIndex(
    name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
    encryption_key=customer_managed_key,
)
# Create or update the index
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

In [49]:
from azure.core.exceptions import HttpResponseError


# Convert the 'id' and 'vector_id' columns to string so one of them can serve as our key field
article_df["id"] = article_df["id"].astype(str)
article_df["vector_id"] = article_df["vector_id"].astype(str)
# Convert the DataFrame to a list of dictionaries
documents = article_df.to_dict(orient="records")

# Create a SearchIndexingBufferedSender
batch_client = SearchIndexingBufferedSender(
    endpoint=endpoint,
    index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    credential=credential)

try:
    # Add upload actions for all documents in a single call
    batch_client.upload_documents(documents=documents)

    # Manually flush to send any remaining documents in the buffer
    batch_client.flush()
except HttpResponseError as e:
    print(f"An error occurred: {e}")
finally:
    # Clean up resources
    batch_client.close()

print(f"Uploaded {len(documents)} documents in total")

Uploaded 25000 documents in total


In [46]:
# Example function to generate document embedding
def generate_embeddings(text, model):
    # Generate embeddings for the provided text using the specified model
    embeddings_response = azure_openai_client.embeddings.create(model=model, input=text)
    # Extract the embedding data from the response
    embedding = embeddings_response.data[0].embedding
    return embedding


first_document_content = documents[0]["text"]
print(f"Content: {first_document_content[:100]}")

content_vector = generate_embeddings(
    first_document_content, model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
)
print(content_vector)

Content: A generator usually means a machine that makes electrical energy. It has a generator head with wires
[-0.012902016751468182, -0.005176974926143885, 0.00759246526286006, 0.003938510548323393, -0.022169481962919235, 0.017720064148306847, 0.005316019058227539, -0.01672411896288395, -0.009073448367416859, -0.03512970358133316, 0.018159830942749977, 0.052047837525606155, -0.025338398292660713, 0.010683775879442692, 0.008200380019843578, 0.03036986105144024, 0.028300363570451736, 0.0007902081124484539, 0.012481650337576866, -0.017500178888440132, -0.003618385177105665, 0.012688600458204746, -0.023191295564174652, -0.006596520077437162, -0.004219832830131054, 0.0228550024330616, 0.019543807953596115, -0.02235056273639202, 0.012248832732439041, -0.0051672738045454025, 0.01387856062501669, 0.01519786473363638, -0.01339998934417963, -0.025622952729463577, -0.014072575606405735, -0.003951444756239653, 0.007980495691299438, -0.02112179808318615, -0.015301339328289032, 0.004362110514193773

# 🔍 Comparing Search Techniques in Azure AI Search

In this section we will explore the three types of search methods:

- **Keyword (Lexical) Search**
- **Pure Vector (Semantic) Search**
- **Hybrid Search (Keyword + Vector)**

Each section includes real-world use case analysis, pros/cons, and example code.

In [66]:
# ===========================================================
# 🔍 Keyword Search: Lexical Matching Based on Query Terms
# ===========================================================

#  USE THIS WHEN:
# - Users search for known terms or specific phrases (e.g., "GDPR", "Samsung S23").
# - Precision matters more than context or similarity.
# - Data is structured or contains domain-specific keywords.

#  AVOID THIS WHEN:
# - Queries are vague or use natural language.
# - You want semantic or contextual understanding of intent.

from azure.search.documents import SearchClient
import os

search_client = SearchClient(
    endpoint=endpoint,
    index_name=os.getenv("AZURE_SEARCH_INDEX_NAME"),
    credential=credential
)

query = "Marie curie"

results = search_client.search(
    search_text=query,
    select=["title", "text", "url"]
)

print("🔍 Keyword Search Results:\n")
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"URL: {result['url']}\n")

🔍 Keyword Search Results:

Title: List of Nobel Prize winners in Physics
Score: 6.6174364
URL: https://simple.wikipedia.org/wiki/List%20of%20Nobel%20Prize%20winners%20in%20Physics

Title: April
Score: 2.3694603
URL: https://simple.wikipedia.org/wiki/April

Title: October 31
Score: 0.8782403
URL: https://simple.wikipedia.org/wiki/October%2031

Title: Feminism
Score: 0.5291682
URL: https://simple.wikipedia.org/wiki/Feminism

Title: 1980
Score: 0.45989814
URL: https://simple.wikipedia.org/wiki/1980



In [70]:
# ===========================================================
# 🤖 Vector Search: Semantic Matching Using Embeddings
# ===========================================================

#  USE THIS WHEN:
# - Users phrase queries naturally or vaguely.
# - You want to retrieve semantically relevant content.
# - Content is long-form and not necessarily keyword-optimized.

#  AVOID THIS WHEN:
# - Precision of keywords is important.
# - Your index doesn’t support vector fields.

from azure.search.documents.models import VectorizedQuery


query = "modern art in Europe"

vector_query = VectorizedQuery(
    vector=generate_embeddings(query, model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")),
    k_nearest_neighbors=3,
    fields="content_vector"
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "text", "url"]
)

print("🔀 Hybrid Search Results:\n")
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"URL: {result['url']}\n")


🔀 Hybrid Search Results:

Title: Anna Maria Thelott
Score: 0.8137101
URL: https://simple.wikipedia.org/wiki/Anna%20Maria%20Thelott

Title: Eurasian magpie
Score: 0.8107782
URL: https://simple.wikipedia.org/wiki/Eurasian%20magpie

Title: 2013
Score: 0.8093003
URL: https://simple.wikipedia.org/wiki/2013



In [98]:
# ===========================================================
# 🔀 Hybrid Search: Combining Lexical and Semantic Matching
# ===========================================================

#  USE THIS WHEN:
# - You want both keyword precision AND semantic recall.
# - Users use mixed query styles: specific and vague.
# - You need to balance flexibility and precision.

#  AVOID THIS WHEN:
# - You only need keyword control.
# - Your index lacks embedding vectors.

query = "modern art in Europe"

vector_query = VectorizedQuery(
    vector=generate_embeddings(query, model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")),
    k_nearest_neighbors=3,
    fields="content_vector"
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["title", "text", "url"]
)

print("🔀 Hybrid Search Results:\n")
for result in results:
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"URL: {result['url']}\n")

🔀 Hybrid Search Results:

{'title': 'Eurasian magpie', 'url': 'https://simple.wikipedia.org/wiki/Eurasian%20magpie', 'text': 'The Eurasian magpie or common magpie (Pica pica) is a bird that lives in Europe, much of Asia, and northwest Africa. It is one of several birds in the crow family named as magpies.\n\nIn Europe, "magpie" is used by English speakers to call the European magpie; it is the only magpie in Europe outside the Iberian Peninsula.\n\nRelated pages\n\nGeneral licences under the Wildlife & Countryside Act 1981\n\nOther websites \n European Magpie videos  on the Internet Bird Collection\n Avibase \n\nCorvids', '@search.score': 0.03177805617451668, '@search.reranker_score': None, '@search.highlights': None, '@search.captions': None}
Title: Eurasian magpie
Score: 0.03177805617451668
URL: https://simple.wikipedia.org/wiki/Eurasian%20magpie

{'title': '2013', 'url': 'https://simple.wikipedia.org/wiki/2013', 'text': '2013 (twenty thirteen) (MMXIII) was . It was the first year si

{'title': 'Pterodactylus', 'url': 'https://simple.wikipedia.org/wiki/Pterodactylus', 'text': 'Pterodactylus was a small pterosaur. It lived in the later Jurassic period, at the same time as many dinosaurs. Pterodactylus is typical of the short-tailed pterosaurs. The sub-order continued to the end of the Cretaceous, but Pterodactylus is only known from the latest Jurassic.  \nThe first specimens were found at Solnhofen in Bavaria, Germany. Also found in these strata was the famous Archaeopteryx. \n\nThese carbonate strata formed in warm lagoons. Pterodactylus may have lived on small islands in the lagoon, or on the coast. Perhaps it lived further inland and was blown here in a storm. The high salinity would support little life, so the pterosaurs would have fed in some other area.\n\nThe first specimen was found in the 18th century, and a description published in 1784. Later, Georges Cuvier worked out that it was a flying reptile. A juvenile pterosaur came to light in 1817, and showed cl

{'title': 'Flood', 'url': 'https://simple.wikipedia.org/wiki/Flood', 'text': "A flood is an overflow of water. This is most commonly due to an overflowing river, a dam break, snowmelt, or heavy rainfall. Less commonly happening are tsunamis, storm surge.\nThe most deadly flooding was in 1931 in China and killed between 2,000,000 and 4,000,000 people. The Kerala flood in India was another flood that has destroyed people's houses.\n\nDuring a flood, people try to move themselves and their most precious belongings to higher ground quickly. The process of leaving homes in search of a safe place is called a flood evacuation.\n\nPollution of drinking water \n\nDuring a flood there is plenty of water logging and overflow of water,  it is mostly polluted and not safe to drink. If people drink this dirty water, they may suffer from diseases such as typhoid and cholera, hepatitis and other such diseases. People can get ready to survive a flood by filling many containers with fresh and clean drin

{'title': 'Cantata', 'url': 'https://simple.wikipedia.org/wiki/Cantata', 'text': "A cantata is a type of singing which is done accompanied by an instrument(s). By contrast, a cappella specifically refers to unaccompanied singing. The word(cantata) etymologically comes from the Italian word “cantare” which meant “to sing”.  The word “cantata” was used mainly in the 17th and 18th century to describe music with religious words that were sung by a choir or by soloists or both, accompanied by instruments. The most famous cantatas are those by Johann Sebastian Bach. Nearly all his cantatas are sacred(written for church services). Very often he used Lutheran hymn tunes (chorales) for the first and last movements. In between there are movements for solo singers: recitatives and arias. A famous example is Bach's cantata no 80 which is based on the chorale “Ein feste Burg ist unser Gott(“A safe stronghold our God is still”). The whole work is about being safe in the hands of God. This is what th

###  Summary Table

| Search Type       | Description                                 | Best Use Case Example                           | Avoid When...                                 |
|-------------------|---------------------------------------------|--------------------------------------------------|------------------------------------------------|
| Keyword Search     | Exact term matching (TF-IDF, inverted index) | Legal docs, product names, structured filters    | Semantic queries or vague natural language     |
| Vector Search      | Embedding-based semantic similarity         | Blogs, unstructured content, FAQ search          | Needing strict keyword matching                |
| Hybrid Search      | Combines keyword and semantic matching      | General-purpose web search, documentation sites  | Index doesn’t support vector fields            |

In [75]:
!pip install --upgrade semantic-kernel





[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [110]:
# Create an AzureOpenAI client instance
azure_openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("api_version"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)


# The query is sent to the search engine, but it's also passed in the prompt

query = "What nobel price did Werner Heisenber win?"

vector_query = VectorizedQuery(
    vector=generate_embeddings(query, model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")),
    k_nearest_neighbors=3,
    fields="content_vector"
)

results = search_client.search(
    searc
    search_text=query,
    select=["title", "text", "url"]
)


# The prompt includes the query and the source, which are specified further down in the code.


GROUNDED_PROMPT="""
You are a friendly assistant that answer questions.
Answer the query using only the sources provided below in a friendly and concise bulleted manner.
Answer ONLY with the facts listed in the list of sources below.
If there isn't enough information below, say you don't know.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""


# Retrieve the selected fields from the search index related to the question
sources_formatted = "\n".join([f'{document["text"]}:{document["title"]}:{document["url"]}' for document in results])

# Call the Azure OpenAI API

response = azure_openai_client.chat.completions.create(
    messages=[
        
        {
            "role": "user",
            "content": GROUNDED_PROMPT.format(query=query, sources=sources_formatted)
        }
    ],
    model=os.getenv("deployment")
)

print(response.choices[0].message.content)

- Werner Heisenberg won the Nobel Prize in Physics.

