# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with OpenAI and Azure Python SDK
## Prerequisites
To run the code, install the following packages. Please use the latest pre-release version `pip install azure-search-documents --pre`.

In [None]:
! pip install azure-search-documents --pre
! pip install openai
! pip install python-dotenv

## Import required libraries and environment variables

In [17]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  
  
# Configure environment variables  
load_dotenv()  
model_deployment_name = os.getenv("AZURE_OPENAI_MODEL_NAME")
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")  
credential = AzureKeyCredential(key)

In [26]:
# removes unwanted characters from the txt file

import os
import codecs

# Path to the directory containing the text files
directory = "../data/txt/DSM/"

# Iterate through the directory
for filename in os.listdir(directory):
    # Check if the file is a text file
    if filename.endswith(".txt"):
        # Open the file in read mode and read its contents
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
            contents = f.read()
        # Convert the encoding to utf-8
        contents = contents.replace('\n\n', ' ').replace('\n', ' ').replace('â€', '').replace('Â', '').replace('©', '').replace('*', '').replace('•', '').replace('*', '').replace('“', '').replace('”', '')
        #contents = contents.encode('utf-8')
        # Open the file in write mode and write the utf-8 encoded contents
        with codecs.open(os.path.join(directory, filename), 'w', encoding='UTF-8') as f:
            f.write(contents)  

In [30]:
def do_summary(context):
    
    messages=[
        {"role": "system", "content": "You are a expert clinical pshychologist who specializing in summarizing exerpts from The Diagnostic and Statistical Manual of Mental Illnesses (DMV-5-TV).  You look for disorders within the exerpts and the diagnostics for each disorder.  Please use all your expertise to approach this task.  Output your content in text format."},
        {"role": "user", "content": f'Summarize the following exerpt from The Diagnostic and Statistical Manual of Mental Illnesses (DMV-5-TV). \n\n CONTEXT: \n\n{context}'}
        ]
        
    response = openai.ChatCompletion.create(
                engine=model_deployment_name,
                messages=messages,
                temperature=1,
        )
    
    return response['choices'][0]['message']['content']

In [32]:
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import pandas as pd

# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002")
    embeddings = response['data'][0]['embedding']
    return embeddings

# get a UUID - URL safe, Base64
def get_a_uuid():
    return str(uuid.uuid4())

# method to get the token length with the encoding
tokenizer_name = tiktoken.get_encoding("cl100k_base")
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000, # this depends on which model you might use, for example with the 16k GPT models setting this to 8k is reasonable and maybe higher
    chunk_overlap=100,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)

#function to return the number of tokens in a string
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    token_integers = encoding.encode(string)
    num_tokens = len(token_integers)

    return num_tokens


In [29]:
# open and read all the txt files and put them into chuncks in a dataframe, this takes the contents of
# the file and splits based on the text splitter.  this needs to be split because of the embeddings
# columns will be title, tokens, content

directory = "../data/txt/DSM"
chunk = {}
txt = []


for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                text = f.read()
                with open(f"../data/summary/{filename}", 'w', encoding='UTF-8') as s:
                    s.write(do_summary(text))

InvalidRequestError: This model's maximum context length is 32768 tokens. However, your messages resulted in 33473 tokens. Please reduce the length of the messages.

In [33]:
# open the main file and write out summaries based on the chunking defined in the text_splitter

directory = "../data/txt/DSM"
chunk = {}
txt = []


for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                text = f.read()
                texts = text_splitter.create_documents([text])
                x = 0
                for i in texts:
                     with open(f"../data/summary/{str(x) + filename}", 'w', encoding='UTF-8') as s:
                        s.write(do_summary(i.page_content))
                        x += 1
                        

## Create dataframe with embeddings from txt files in a directory
Read your txt files into a dataframe that can be used to load the index:

In [34]:
# open and read all the txt files and put them into chuncks in a dataframe, this takes the contents of
# the file and splits based on the text splitter.  this needs to be split because of the embeddings
# columns will be title, tokens, content

directory = "../data/txt/DSM"
chunk = {}
txt = []

for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                text = f.read()
                texts = text_splitter.create_documents([text])
                for i in texts:
                        chunk = {
                                "id": get_a_uuid(),  # generate a random uuid for the document
                                "title": filename[:-4],  # remove the .txt extension from the filename and use this as the title
                                "content": i.page_content,
                                "summary": do_summary(i.page_content),
                                "sourcefile": filename,
                                "content_tokens": num_tokens_from_string(i.page_content, "cl100k_base"),
                                "category": "Anxiety Disorders",
                                "contentVector": generate_embeddings(i.page_content)
                                }
                        txt.append(chunk)

df = pd.DataFrame(txt)
df

Unnamed: 0,id,title,content,summary,sourcefile,content_tokens,category,contentVector
0,4135e984-9ecd-45e5-9e0e-4d2735050cd7,Anxiety Disorders,Anxiety Disorders Anxiety disorders include d...,The excerpt presents a comprehensive overview ...,Anxiety Disorders.txt,3994,Anxiety Disorders,"[-0.021641232073307037, 0.03180353343486786, 0..."
1,a5c045ef-be4c-462b-b907-c16c8cf093af,Anxiety Disorders,"or at least virtual contact with, their key at...",This excerpt can be divided into three section...,Anxiety Disorders.txt,3998,Anxiety Disorders,"[-0.018110305070877075, 0.02612890675663948, 0..."
2,32ce21c4-3ecd-418e-aa01-e5a2a1ca5f86,Anxiety Disorders,phobia must cause clinically significant distr...,The excerpt from the Diagnostic and Statistica...,Anxiety Disorders.txt,3993,Anxiety Disorders,"[-0.026190858334302902, 0.023814618587493896, ..."
3,d223526e-32bb-42dc-b177-65eac879af07,Anxiety Disorders,to a party). Social anxiety among older adults...,The provided text from the Diagnostic and Stat...,Anxiety Disorders.txt,3996,Anxiety Disorders,"[-0.02221773937344551, 0.012617803178727627, 0..."
4,ab66bdc9-095c-4162-916d-dfc9188b1bf9,Anxiety Disorders,"(fewer than four symptoms) attacks, and the nu...",The excerpt presents an in-depth description o...,Anxiety Disorders.txt,3998,Anxiety Disorders,"[-0.016278991475701332, 0.02659684605896473, 0..."
5,c1b2368f-29c1-499e-b8b2-18df32966e99,Anxiety Disorders,"including, but not limited to, dizziness, card...",The excerpt describes Panic Attacks and Agorap...,Anxiety Disorders.txt,3993,Anxiety Disorders,"[-0.02697766199707985, 0.021825220435857773, 0..."
6,dc3c2bd9-6f08-4a63-b1ee-ff6179d72cb4,Anxiety Disorders,"such as a partner, friend, or health professio...",The excerpt discusses two main psychological d...,Anxiety Disorders.txt,3996,Anxiety Disorders,"[-0.0033753656316548586, 0.011218320578336716,..."
7,bd006504-bfef-4c43-a9fc-cccd4c7a5cd7,Anxiety Disorders,reassurance about their performance and other ...,This excerpt discusses various diagnostic cons...,Anxiety Disorders.txt,3994,Anxiety Disorders,"[-0.009039944037795067, 0.0193722415715456, 0...."
8,aad31006-8607-4fa1-b6d6-b994a549bf17,Anxiety Disorders,"or another medical condition), a diagnosis of ...",The excerpt discusses the diagnostic criteria ...,Anxiety Disorders.txt,2195,Anxiety Disorders,"[-0.019852414727211, 0.04183370620012283, 0.02..."


In [36]:
df['summary'][0]

'The excerpt presents a comprehensive overview of various Anxiety Disorders as identified in the Diagnostic and Statistical Manual of Mental Illnesses (DMV-5-TV). These disorders share features of excessive fear and anxiety and related behavioral disturbances. Generally, they persist beyond developmentally appropriate periods and are persistent, often lasting 6 months or more.\n\nFollowing are the disorders and their diagnostics discussed in the excerpt:\n\n1. Separation Anxiety Disorder: This refers to developmentally inappropriate and excessive fear or anxiety concerning separation from those to whom the individual is attached. Symptoms often develop in childhood, but can be expressed throughout adulthood. Diagnosis requires the symptoms to be persistent, lasting at least 4 weeks in children and adolescents and typically 6 months or more in adults. The disturbance must cause clinically significant distress or impairment in social, academic, occupational, or other important areas of f

## Create your search index
Create your search index schema and vector search configuration:

In [37]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="summary", type=SearchFieldDataType.String),
    SearchableField(name="sourcefile", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String,
                    filterable=True),
    SearchField(name="summaryVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="dsm-vector-config"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="dsm-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="dsm-vector-config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="dsm-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_keywords_fields=[SemanticField(field_name="category"), SemanticField(field_name="sourcefile")],
        prioritized_content_fields=[SemanticField(field_name="content"),SemanticField(field_name="summary")],
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 dsm-vector-index created


## Insert text and embeddings into vector store from data frame
Creates a list called sections from the dataframe to be loaded into the index:

In [38]:
# populate a list with the data we will use to store in the index
import re

def create_sections(df):
    for index, row in df.iterrows():
        yield {
            "id": row["id"],
            "title": row["title"],
            "content": row["content"],
            "summary": row["summary"],
            "sourcefile": row["sourcefile"],
            "category": row["category"],
            "summaryVector": generate_embeddings(row["summary"]),
            "contentVector": row["contentVector"],
            "@search.action": "upload",
        }
        
sections = create_sections(df)

## Load the sections list into the index
Loops thru a list called sections and creates a document ofr each item in the index:

In [39]:
def index_sections(sections):
    print(
        f"Indexing sections into search index '{index_name}'"
    )

    search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

    i = 0
    batch = []
    for s in sections:
        batch.append(s)
        i += 1
        if i % 1000 == 0:
            results = search_client.upload_documents(documents=batch)
            succeeded = sum([1 for r in results if r.succeeded])
            print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
            batch = []

    if len(batch) > 0:
        results = search_client.upload_documents(documents=batch)
        succeeded = sum([1 for r in results if r.succeeded])
        print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
        
index_sections(sections)

Indexing sections into search index 'dsm-vector-index'
	Indexed 9 sections, 9 succeeded


## Perform a vector similarity search

In [41]:
# Pure Vector Search
query = "types of anxiety disorders"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector = Vector(value=generate_embeddings(query), k=1, fields="summaryVector")
  
results = search_client.search(  
    search_text=None,  
    vectors= [vector],
    select=["title", "content", "category"],
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  


Title: Anxiety Disorders
Score: 0.88774735
Content: Anxiety Disorders  Anxiety disorders include disorders that share features of excessive fear and anxiety and related behavioral disturbances. Fear is the emotional response to real or perceived imminent threat, whereas anxiety is anticipation of future threat. Obviously, these two states overlap, but they also differ, with fear more often associated with surges of autonomic arousal necessary for fight or flight, thoughts of immediate danger, and escape behaviors, and anxiety more often associated with muscle tension and vigilance in preparation for future danger and cautious or avoidant behaviors. Sometimes the level of fear or anxiety is reduced by pervasive avoidance behaviors. Panic attacks feature prominently within the anxiety disorders as a particular type of fear response. Panic attacks are not limited to anxiety disorders but rather can be seen in other mental disorders as well.  The anxiety disorders differ from one another i

In [None]:
# Pure Vector Search multi-lingual (e.g 'challenger sales model' in French)  
query = "types of anxiety disorders"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector = Vector(value=generate_embeddings(query), k=3, fields="summaryVector")  
  
results = search_client.search(  
    search_text=None,  
    vectors=[vector],
    select=["title", "summary", "category"],
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Summary: {result['summary']}")  
    print(f"Category: {result['category']}\n")  


## Perform a Cross-Field Vector Search

In [None]:
# Cross-Field Vector Search
query = "types of anxiety disorders"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)  
vector = Vector(value=generate_embeddings(query), k=3, fields="summaryVector, contentVector")  
  
results = search_client.search(  
    search_text=None,  
    vectors=[vector],
    select=["title", "summary", "category"],
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Summary: {result['summary']}")  
    print(f"Category: {result['category']}\n")  


## Perform a Multi-Vector Search

In [None]:
# Multi-Vector Search
query = "challenger sales model"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)  
vector1 = Vector(value=generate_embeddings(query), k=3, fields="summaryVector")  
vector2 = Vector(value=generate_embeddings(query), k=3, fields="contentVector")  
  
results = search_client.search(  
    search_text=None,  
    vectors=[vector1, vector2],
    select=["title", "content", "category"],
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  


## Perform a Pure Vector Search with a filter

In [None]:
# Pure Vector Search with Filter
query = "tools for software development"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)  
vector = Vector(value=generate_embeddings(query), k=3, fields="contentVector")  

results = search_client.search(  
    search_text=None,  
    vectors=[vector],
    filter="category eq 'Challenger Customer'",
    select=["title", "content", "category"]
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  


## Perform a Hybrid Search

In [None]:
# Hybrid Search
query = "challenger sales model"  
  
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
vector = Vector(value=generate_embeddings(query), k=3, fields="contentVector")  

results = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["title", "content", "category"],
    top=3
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  


## Perform a Semantic Hybrid Search

In [42]:
# Semantic Hybrid Search
query = "types of anxiety disorders"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector = Vector(value=generate_embeddings(query), k=3, fields="summaryVector")  

results = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["title", "summary", "category"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='dsm-semantic-config', query_caption="extractive", query_answer="extractive",
    top=3
)

semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Summary: {result['summary']}")
    print(f"Category: {result['category']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


Semantic Answer: The most frequent additional diagnoses are other anxiety disorders (e.g.,<em> specific phobias, panic disorder, social anxiety disorder), depressive disorders (major depressive disorder), PTSD,</em> and<em> alcohol use disorder.</em>
Semantic Answer Score: 0.81591796875

Title: Anxiety Disorders
Summary: The excerpt discusses two main psychological disorders: Agoraphobia and Generalized Anxiety Disorder.

Agoraphobia is characterized by an individual employing safety behaviors, such as sitting near exits in crowded or enclosed places. It involves fear, anxiety, or avoidance that do not correspond to the actual danger posed by the situation but are considered disproportionate. The fear or avoidance must be persistent and cause significant distress or impairment in social, occupational, or other significant areas of functioning. Associated features include severe forms that can cause individuals to become completely homebound and dependent; depressive symptoms, alcohol, 