# AI Search with RAG optimization
This document illustrates an example workflow for how to optimize Azure AI Search for RAG use cases to enhance the quality of document search. 

# Objective
See if your RAG app's quality metrics improve using only the capabilities of Azure AI Search
- Enhanced Vector (text-embedding3-large)
- Multivector
- BM25 + Vector
- BM25 + Vector + Semantic Reranking
- BM25 + Vector + Semantic Reranking + Query Rewrite


## Prerequisites
Configure a Python virtual environment for 3.10 or later: 
 1. open the Command Palette (Ctrl+Shift+P).
 1. Search for Python: Create Environment.
 1. select Venv / Conda and choose where to create the new environment.
 1. Select the Python interpreter version. Create with version 3.10 or later.

For a dependency installation, run the code below to install the packages required to run it. 

```bash
# Create a virtual environment
python -m venv venv

# Activate the virtual environment
# On Windows
venv\Scripts\activate

# On macOS/Linux
source venv/bin/activate

pip install -r requirements.txt
```

## Set up your environment
Git clone the repository to your local machine. 

```bash
git clone https://github.com/hyogrin/Azure_OpenAI_samples.git
```

Create an .env file based on the .env-sample file. Copy the new .env file to the folder containing your notebook and update the variables.

## Load environment variables

In [27]:

import os

import json
from openai import AzureOpenAI
import sys
import pandas as pd
import tiktoken
import re
from dotenv import load_dotenv

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizableTextQuery
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    AzureOpenAIVectorizerParameters,
    SemanticField,
    SearchIndex,
    VectorSearchAlgorithmMetric,
    VectorSearchAlgorithmKind,
    HnswParameters,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    ScoringProfile,
    TextWeights
)

from tenacity import retry, wait_random_exponential, stop_after_attempt

load_dotenv(override=True)   

search_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
ada002_deployment = os.getenv("AZURE_OPENAI_ADA002_EMBEDDING_DEPLOYMENT_NAME")
large3_deployment = os.getenv("AZURE_OPENAI_3_LARGE_EMBEDDING_DEPLOYMENT_NAME")
gpt_chat_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")

print(f"search_endpoint: {search_endpoint}")



search_endpoint: https://rag-innovator-search-svc.search.windows.net


## Prepare the dataset (optional)

In [28]:
# import pandas as pd

# # Read the JSON file
# faq_data = pd.read_json('data/Extracted_FAQ_v3.json')

# # Rename columns to lowercase and 'Num' to 'id'
# faq_data.columns = faq_data.columns.str.lower()
# faq_data.rename(columns={'num': 'id'}, inplace=True)

# # Add a new column 'title' combining 'sheet_name', 'category', and 'type'
# faq_data['title'] = faq_data['sheet_name'] + " " + faq_data['category'] + " " + faq_data['type']

# # Display the modified dataframe
# print(faq_data.head())

In [29]:
# import os
# import json
# from openai import AzureOpenAI
# from dotenv import load_dotenv
# load_dotenv()

# aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
# aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
# aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
# aoai_deployment_name = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
# if not aoai_api_version:
#     aoai_api_version = os.getenv("OPENAI_API_VERSION")
    
# try:
#     client = AzureOpenAI(
#         azure_endpoint = aoai_api_endpoint,
#         api_key        = aoai_api_key,
#         api_version    = aoai_api_version
#     )
# except (ValueError, TypeError) as e:
#     print(e)

In [30]:
# # Ensure the OpenAI client is already initialized as `client`

# # Function to translate text using OpenAI
# def translate_to_korean(text):
#     try:
#         response = client.chat.completions.create(
#             model=aoai_deployment_name,
#             messages=[
#                 {"role": "system", "content": "translate the following text to Korean."},
#                 {"role": "user", "content": text},
#             ],
#             temperature=0.3,
#             max_tokens=1000,
#         )
#         return response.choices[0].message.content.strip()
#     except Exception as e:
#         print(f"Error translating text: {e}")
#         return None

# # Add a new column 'question_ko' with Korean translations
# faq_data['title'] = faq_data['title'].apply(translate_to_korean)



In [31]:
# faq_data[['id', 'category', 'type', 'title', 'content']].to_json(
#     'data/rag_sample_data_ko.jsonl', 
#     orient='records', 
#     lines=True, 
#     force_ascii=False
# )

In [32]:
# faq_data[['id', 'category', 'type', 'question', 'answer']].to_json(
#     'data/rag_sample_qna_ko.jsonl', 
#     orient='records', 
#     lines=True, 
#     force_ascii=False
# )

In [33]:
# faq_data.to_json('data/rag_sample_data_ko.jsonl', orient='records', lines=True, force_ascii=False)

## Create embeddings
- Insert Azure AI Search indexes by reading data, creating OpenAI embeddings, and exporting to a valid format

In [34]:
MAX_RETRIES = 3
client = AzureOpenAI(
    azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),        # Azure OpenAI base URL
    api_key        = os.getenv("AZURE_OPENAI_API_KEY"),
    api_version    = os.getenv("AZURE_OPENAI_CHAT_API_VERSION"),
    max_retries    = MAX_RETRIES
)

In [35]:
!ls data/rag_sample_data_ko.jsonl -lh

-rw-r--r-- 1 azureuser azureuser 125K Mar 29 07:11 data/rag_sample_data_ko.jsonl


In [36]:
# Generate Document Embeddings using OpenAI Ada 002

# Read the text-sample.json
# with open("text-sample.json", "r", encoding="utf-8") as file:
#     input_data = json.load(file)

df_input_data = pd.read_json(os.path.join(os.getcwd(),'data/rag_sample_data_ko.jsonl'), lines=True)    

In [37]:
#pd.options.mode.chained_assignment = None #https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters

# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

df_input_data['content']= df_input_data["content"].apply(lambda x : normalize_text(x))

- To take advantage of the Embedding API provided by Azure OpenAI, we check that the document does not have more than 8,192 tokens of text in the document

In [38]:
tokenizer = tiktoken.get_encoding("cl100k_base")
df_input_data['n_tokens'] = df_input_data["content"].apply(lambda x: len(tokenizer.encode(x)))
df_input_data = df_input_data[df_input_data.n_tokens<8192]
len(df_input_data)
df_input_data

Unnamed: 0,id,category,type,title,content,n_tokens
0,1,General,Whats New,S24 Series General Whats New,"스타즈의 최신 갤러그 S24 시리즈는 최신 기술을 집약한 스마트폰으로, 다양한 AI...",748
1,2,Design,Color,S24 Series Design Color,"갤러그 S24 시리즈는 자연에서 영감을 받은 클래식한 컬러들로 구성되어 있어, 사용...",340
2,3,Design,Design,S24 Series Design Design,"갤러그 스마트폰은 독특한 시그니처 디자인으로 잘 알려져 있으며, 이는 브랜드의 정체...",349
3,4,Design,Color,S24 Series Design Color,해당 컬러는 현재 이 지역에서는 제공되지 않습니다. 이는 특정 색상이 지역별로 재고...,227
4,5,Design,Color,S24 Series Design Color,스타즈닷컴에서는 다양한 전용 컬러 옵션을 제공하여 소비자들이 개인의 취향에 맞는 제...,354
...,...,...,...,...,...,...
105,96,Display,Brightness,S24 FE Display Brightness,갤러그 S24 FE의 Dynamic AMOLED 2X 디스플레이는 최신 기술을 적용...,406
106,97,Display,Size,S24 FE Display Size,새로운 갤러그 S24 FE는 6.7인치 FHD+ 디스플레이를 탑재하여 사용자에게 선...,318
107,98,Galaxy Ecosystem,Link to Windows,S24 FE Galaxy Ecosystem Link to Windows,갤러그 스마트폰과 PC 간의 연속성 기능이 더욱 강화되었습니다. 이 기능은 사용자들...,402
108,99,Galaxy Ecosystem,Stars\nDeX,S24 FE Galaxy Ecosystem Stars\nDeX,Stars DeX는 외부 디스플레이와 스마트폰 간의 멀티태스킹을 가능하게 하여 사용...,429


- After completing the verification, delete the columns that are no longer needed, and save the data for insertion into Azure AI Search. 

In [39]:
df_input_data = df_input_data.drop('n_tokens', axis=1)

In [40]:
# Generate Document Embeddings using OpenAI Ada 002
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def get_embedding(text, model=ada002_deployment): # model = "deployment_name"
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [41]:
# model should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
df_input_data['content_vector'] = df_input_data["content"].apply(lambda x : get_embedding (x, model = ada002_deployment)) 
df_input_data['content_vector_3'] = df_input_data["content"].apply(lambda x : get_embedding (x, model = large3_deployment)) 
df_input_data

Unnamed: 0,id,category,type,title,content,content_vector,content_vector_3
0,1,General,Whats New,S24 Series General Whats New,"스타즈의 최신 갤러그 S24 시리즈는 최신 기술을 집약한 스마트폰으로, 다양한 AI...","[-0.011637956835329533, 0.00048602258902974427...","[-0.01222926378250122, 0.0009613497531972826, ..."
1,2,Design,Color,S24 Series Design Color,"갤러그 S24 시리즈는 자연에서 영감을 받은 클래식한 컬러들로 구성되어 있어, 사용...","[0.007965372875332832, 0.005753505975008011, 0...","[-0.013997545465826988, -0.018798841163516045,..."
2,3,Design,Design,S24 Series Design Design,"갤러그 스마트폰은 독특한 시그니처 디자인으로 잘 알려져 있으며, 이는 브랜드의 정체...","[-0.011664549820125103, 0.00046807320904918015...","[0.00952054001390934, -0.012812593951821327, -..."
3,4,Design,Color,S24 Series Design Color,해당 컬러는 현재 이 지역에서는 제공되지 않습니다. 이는 특정 색상이 지역별로 재고...,"[0.005915156099945307, -0.01991063915193081, 0...","[-0.016029220074415207, -0.05892958119511604, ..."
4,5,Design,Color,S24 Series Design Color,스타즈닷컴에서는 다양한 전용 컬러 옵션을 제공하여 소비자들이 개인의 취향에 맞는 제...,"[0.010122701525688171, -0.014961718581616879, ...","[0.005488332826644182, -0.056681133806705475, ..."
...,...,...,...,...,...,...,...
105,96,Display,Brightness,S24 FE Display Brightness,갤러그 S24 FE의 Dynamic AMOLED 2X 디스플레이는 최신 기술을 적용...,"[0.009320260025560856, 0.01145191676914692, 0....","[0.008665835484862328, 0.0001571383763803169, ..."
106,97,Display,Size,S24 FE Display Size,새로운 갤러그 S24 FE는 6.7인치 FHD+ 디스플레이를 탑재하여 사용자에게 선...,"[0.01311912015080452, 0.026158811524510384, 0....","[-0.003202072810381651, 0.014558830298483372, ..."
107,98,Galaxy Ecosystem,Link to Windows,S24 FE Galaxy Ecosystem Link to Windows,갤러그 스마트폰과 PC 간의 연속성 기능이 더욱 강화되었습니다. 이 기능은 사용자들...,"[-0.009168842807412148, -0.012940751388669014,...","[0.002200562506914139, -0.019689880311489105, ..."
108,99,Galaxy Ecosystem,Stars\nDeX,S24 FE Galaxy Ecosystem Stars\nDeX,Stars DeX는 외부 디스플레이와 스마트폰 간의 멀티태스킹을 가능하게 하여 사용...,"[-0.006186055485159159, -0.006809015292674303,...","[0.03049645572900772, -0.019633416086435318, -..."


In [42]:
df_input_data.to_csv(os.path.join(os.getcwd(),'data/embedding_input_data.csv'), index=False)

## Create your search index
Create your search index schema and vector search configuration:

In [None]:
# Create a search index
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
fields = [
        SimpleField(name="id", key=True, type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=False),  
        SearchableField(name="category", type=SearchFieldDataType.String, filterable=True, facetable=True),
        SearchableField(name="type", type=SearchFieldDataType.String, filterable=True, facetable=True),
        SearchableField(name="title", type=SearchFieldDataType.String, filterable=True, facetable=True),
        SearchableField(name="content", type=SearchFieldDataType.String, filterable=True, facetable=True, analyzer_name="ko.microsoft"),
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",  # Ensure vector_search_profile is set
        ),
        SearchField(
            name="content_vector_3",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=3072,
            vector_search_profile_name="myHnswProfile",  # Ensure vector_search_profile is set
        )]
# Configure the vector search configuration  
vector_search = VectorSearch(  
    algorithms=[  
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=10,
                ef_construction=1000,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm_configuration_name="myHnsw",  
            vectorizer="myOpenAI",  
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
            vectorizer="myOpenAI",  
        )
        
    ],  
    vectorizers=[  
        AzureOpenAIVectorizer(  
            vectorizer_name="myOpenAI_vectorizer",  
            kind="azureOpenAI",  
            parameters=AzureOpenAIVectorizerParameters(  
                resource_url=os.getenv("AZURE_OPENAI_ENDPOINT"),  
                deployment_name=ada002_deployment,
                model_name=ada002_deployment,
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            ),
            
        ),  
    ],  
)  


    
  
semantic_config = SemanticConfiguration(  
    name="my-semantic-config",  
        prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="content")]  
    ),  
)

# New scoring profile for keyword search
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.scoringprofile?view=azure-python
scoring_profiles = [  
    ScoringProfile(  
        name="my-scoring-profile",
        text_weights = TextWeights(
        	weights = {
                'category' : 1,
                'type' : 1,
                'title' : 3,
                'content' : 1
            }
        )
    ),
]


# Create the semantic search with the configuration  
semantic_search = SemanticSearch(configurations=[semantic_config])  

# Create a suggester that is a configuration in an index that specifies which fields should be used to populate autocomplete and suggested matches.
suggesters= [
    {
        "name": "sg",
        "searchMode": "analyzingInfixMatching",
        "sourceFields": ["title"]
    }
]

try:
    existing_index = index_client.get_index(index_name)
    print(f"index '{index_name}' exists. Proceed the index update.")

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search,suggesters=suggesters, scoring_profiles=scoring_profiles)
    result = index_client.create_or_update_index(index)
    print(f"index '{result.name}' updated.")
    

except Exception as e:
    print(f"index ( {index_name} ) does not exist. Create a new index.")

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search,suggesters=suggesters, scoring_profiles=scoring_profiles)
    result = index_client.create_or_update_index(index)
    print(f"index ( {result.name} ) creation completed.")
    

vectorizer is not a known attribute of class <class 'azure.search.documents.indexes._generated.models._models_py3.VectorSearchProfile'> and will be ignored
vectorizer is not a known attribute of class <class 'azure.search.documents.indexes._generated.models._models_py3.VectorSearchProfile'> and will be ignored


index 'optimize-test-index' exists. Proceed the index update.
index 'optimize-test-index' updated.


## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [44]:
# Upload data_embeddings.csv to search_client.upload_documents to create the index 
documents = df_input_data.to_dict(orient='records')
for doc in documents:
    doc['id'] = str(doc['id'])
search_client = SearchClient(
    endpoint=search_endpoint, index_name=index_name, credential=credential
)
result = search_client.upload_documents(documents)

print(f"Uploaded {len(documents)} documents")

Uploaded 110 documents
