# Prerequisites
본 `ipynb` 은 `Python=3.12` 에서 작성하였습니다. Package dependency 를 해결하기 위해 아래 cell 을 실행해주세요.

## Install Python packages

In [None]:
%pip -q install -U azure-identity azure-search-documents azure-ai-documentintelligence langchain langchain-community langchain-openai pandas

## Load environment variables from a .env file
secret 노출을 피하고 notebook 들 간에 일관된 환경변수를 설정하기 위해 `dotenv` 를 이용한다.

In [None]:
import os
from dotenv import load_dotenv

# Load the .env file; override=True lets its values replace variables that are
# already set in the process environment.
load_dotenv(override=True)

# Azure OpenAI settings (each is None when the variable is not set).
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Azure AI Search settings (each is None when the variable is not set).
AZURE_AI_SEARCH_ENDPOINT = os.environ.get("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_ADMIN_KEY = os.environ.get("AZURE_AI_SEARCH_ADMIN_KEY")

# Retrieval engine
Azure 에서 제공하는 RAG 를 위한 retrieval engine 은 AI Search 이다. AI Search 는 확장적인 query pipeline 을 통해 강력한 검색 기능을 제공한다.

사용하는 data 는 아래와 같으며, 임의의 상호(merchant) 정보를 담고 있다.

In [None]:
import pandas as pd

# Read the sample merchants CSV into a DataFrame; the later cells convert it
# into search documents.
data = pd.read_csv(filepath_or_buffer="./resources/KR_Merchants_Sample.csv")

# A bare expression on the last line makes the notebook render the DataFrame.
data

## Keyword search
먼저 string 기반의 keyword search 를 알아보자.

### Create an index
merchants 를 묘사하는 index 를 선언한다.

In [None]:
# Name of the Azure AI Search index that the keyword-search cells below create and query.
index_name = "merchants-keyword"

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    ComplexField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
)

# One credential object is shared by the index client and the document client.
credential = AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY)

# Flat columns of `data` that are folded into the nested "Address" field.
address_columns = ["StreetAddress", "City", "StateProvince", "PostalCode", "Country"]
# Flat columns removed before upload: the address columns (now nested), "Tag"
# (replaced by "Tags") and columns the index schema below does not define.
dropped_columns = address_columns + ["Latitude", "Longitude", "LocationType", "Tag"]

# Index schema.
string_type = SearchFieldDataType.String
fields = [
    SimpleField(name="MerchantId", type=string_type, key=True, filterable=True),
    SearchableField(name="MerchantName", type=string_type, sortable=True),
    SearchableField(name="Description", type=string_type),
    SearchableField(name="Category", type=string_type, sortable=True, filterable=True, facetable=True),
    SearchField(
        name="Tags",
        type=SearchFieldDataType.Collection(string_type),
        searchable=True,
        filterable=True,
        facetable=True,
    ),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, filterable=True, sortable=True, facetable=True),
    ComplexField(
        name="Address",
        fields=[
            SearchableField(name=column, type=string_type, filterable=True, sortable=True, facetable=True)
            for column in address_columns
        ],
    ),
]

# Create the index, or update it when it already exists.
index_client = SearchIndexClient(endpoint=AZURE_AI_SEARCH_ENDPOINT, credential=credential)
index = index_client.create_or_update_index(SearchIndex(name=index_name, fields=fields))
print(f"{index.name} created")

# Turn every row of `data` into a search document that matches the schema.
documents = data.to_dict(orient="records")
for record in documents:
    # "Tag" is a comma-separated string; keep the trimmed, non-empty entries.
    pieces = (piece.strip() for piece in record["Tag"].split(","))
    record["Tags"] = [piece for piece in pieces if piece]

    # Nest the address columns; the postal code is stored as a string.
    address = {column: record[column] for column in address_columns}
    address["PostalCode"] = str(address["PostalCode"])
    record["Address"] = address

    for column in dropped_columns:
        record.pop(column, None)

# Upload the documents and report the per-document outcome.
search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=credential,
)
result = search_client.upload_documents(documents=documents)
for outcome in result:
    print(f"Key: {outcome.key}, Succeeded: {outcome.succeeded}, ErrorMessage: {outcome.error_message}")

### Process search query
준비된 search index 에 keyword search 를 보내보자.

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

search_client = SearchClient(
    index_name=index_name,
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY),
)

# Each entry pairs the label printed before the hits with the keyword
# arguments passed to SearchClient.search().
keyword_queries = [
    (
        # Keyword match restricted to the "Tags" field.
        "검색어: 예약",
        {
            "search_text": "예약",
            "search_fields": ["Tags"],
            "select": ["MerchantName", "Description", "Tags"],
            "top": 5,
        },
    ),
    (
        # Free-text keyword match without a field restriction.
        "검색어: 와인 파는 가게",
        {
            "search_text": "와인 파는 가게",
            "select": ["MerchantName", "Description"],
            "top": 5,
        },
    ),
    (
        # Match-all query narrowed by a filter and ordered by rating.
        "Rating 이 4 보다 높은 가게",
        {
            "search_text": "*",
            "filter": "Rating gt 4",
            "select": ["MerchantName", "Description", "Rating"],
            "order_by": ["Rating desc"],
            "top": 5,
        },
    ),
]

for label, search_options in keyword_queries:
    result = search_client.search(**search_options)
    print(label)
    for doc in result:
        print(doc)

### Delete the index
사용하지 않는다면, index 는 삭제하자.

In [None]:
# Delete the keyword index created above. `index_name` is used instead of a
# repeated string literal so this stays in sync with the name the index was
# created under, matching the delete cells of the other sections.
index_client.delete_index(index_name)

## Vector search
string 을 embedding 한 vector 를 index 에 저장하여 검색에 활용한다.

### Create an index
위에서 만든 keyword 용 merchants-keyword index 의 schema 에 vector field 와 vector search 설정을 추가한 새 index (merchants-vector) 를 만든다.

In [None]:
from openai import AzureOpenAI

# Name of the index that the vector-search cells below create and query.
index_name = "merchants-vector"

# Azure OpenAI client; the following cells use it to create embeddings.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    ComplexField,
    ExhaustiveKnnAlgorithmConfiguration,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)

# One credential object is shared by the index client and the document client.
credential = AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY)

# Names that tie the vector field to its profile and the profile to its algorithm.
vector_profile_name = "my-vector-profile"
hnsw_config_name = "my-hnsw-vector-config-1"

# Flat columns of `data` that are folded into the nested "Address" field.
address_columns = ["StreetAddress", "City", "StateProvince", "PostalCode", "Country"]
# Flat columns removed before upload: the address columns (now nested), "Tag"
# (replaced by "Tags") and columns the index schema below does not define.
dropped_columns = address_columns + ["Latitude", "Longitude", "LocationType", "Tag"]

# Index schema: the keyword schema plus the "DescriptionVector" field.
string_type = SearchFieldDataType.String
fields = [
    SimpleField(name="MerchantId", type=string_type, key=True, filterable=True),
    SearchableField(name="MerchantName", type=string_type, sortable=True),
    SearchableField(name="Description", type=string_type),
    SearchField(
        name="DescriptionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # NOTE(review): 3072 presumably matches the output size of the deployed
        # embedding model - confirm against AZURE_OPENAI_EMBEDDING_DEPLOYMENT.
        vector_search_dimensions=3072,
        vector_search_profile_name=vector_profile_name,
    ),
    SearchableField(name="Category", type=string_type, sortable=True, filterable=True, facetable=True),
    SearchField(
        name="Tags",
        type=SearchFieldDataType.Collection(string_type),
        searchable=True,
        filterable=True,
        facetable=True,
    ),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, filterable=True, sortable=True, facetable=True),
    ComplexField(
        name="Address",
        fields=[
            SearchableField(name=column, type=string_type, filterable=True, sortable=True, facetable=True)
            for column in address_columns
        ],
    ),
]

# Two algorithm configurations are declared; the only profile references the HNSW one.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name=hnsw_config_name, kind="hnsw"),
        ExhaustiveKnnAlgorithmConfiguration(name="my-eknn-vector-config", kind="exhaustiveKnn"),
    ],
    profiles=[
        VectorSearchProfile(name=vector_profile_name, algorithm_configuration_name=hnsw_config_name),
    ],
)

# Create the index, or update it when it already exists.
index_client = SearchIndexClient(endpoint=AZURE_AI_SEARCH_ENDPOINT, credential=credential)
index = index_client.create_or_update_index(
    SearchIndex(name=index_name, fields=fields, vector_search=vector_search),
)
print(f"{index.name} created")

# Turn every row of `data` into a search document that matches the schema.
documents = data.to_dict(orient="records")
for record in documents:
    # "Tag" is a comma-separated string; keep the trimmed, non-empty entries.
    pieces = (piece.strip() for piece in record["Tag"].split(","))
    record["Tags"] = [piece for piece in pieces if piece]

    # Nest the address columns; the postal code is stored as a string.
    address = {column: record[column] for column in address_columns}
    address["PostalCode"] = str(address["PostalCode"])
    record["Address"] = address

    for column in dropped_columns:
        record.pop(column, None)

# Embed all descriptions in a single request and attach each vector to its
# document; `item.index` is used as the position in `documents`.
descriptions = [record["Description"] for record in documents]
response = client.embeddings.create(
    model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    input=descriptions,
)
for item in response.data:
    documents[item.index]["DescriptionVector"] = item.embedding

# Upload the documents and report the per-document outcome.
search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=credential,
)
result = search_client.upload_documents(documents=documents)
for outcome in result:
    print(f"Key: {outcome.key}, Succeeded: {outcome.succeeded}, ErrorMessage: {outcome.error_message}")

### Process search query
준비된 search index 에 vector search 를 보내보자.

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
# Imported here as well so this cell does not depend on the index-creation
# cell for the VectorizedQuery name.
from azure.search.documents.models import VectorizedQuery


def _embed_query(text):
    """Return the embedding of `text` from the configured Azure OpenAI embedding deployment."""
    return client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    ).data[0].embedding


search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY),
    index_name=index_name,
)

# (query text, fields to return). Each query text is written once and reused
# for both the embedding request and the printed label, so the label cannot
# drift from the query actually sent (the second label was previously misspelled).
vector_searches = [
    ("예약 가능한 가게", ["MerchantName", "Description", "Tags"]),
    ("베지테리언", ["MerchantName", "Description"]),
]

for query_text, selected_fields in vector_searches:
    # No search_text is passed: only the vector query on "DescriptionVector" is sent.
    result = search_client.search(
        vector_queries=[VectorizedQuery(
            vector=_embed_query(query_text),
            k_nearest_neighbors=5,
            fields="DescriptionVector",
            kind="vector",
        )],
        select=selected_fields,
        top=5,
    )
    print(f"검색어: {query_text}")
    for doc in result:
        print(doc)

### Delete the index
사용하지 않는다면, index 는 삭제하자.

In [None]:
# Delete the index whose name is currently held in `index_name`.
index_client.delete_index(index_name)

## Semantic search
AI Search 에 내장되어 있는 Semantic Model 을 활용한 검색 기능을 살펴보자.

### Create an index
merchants 를 묘사하는 index 를 선언한다. vector field 와 vector search 설정에 더해 semantic configuration 을 추가한다.

In [None]:
# Name of the Azure AI Search index that the semantic-search cells below create and query.
index_name = "merchants-semantic"

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    ComplexField,
    ExhaustiveKnnAlgorithmConfiguration,
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)

# One credential object is shared by the index client and the document client.
credential = AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY)

# Names that tie the vector field to its profile and the profile to its algorithm.
vector_profile_name = "my-vector-profile"
hnsw_config_name = "my-hnsw-vector-config-1"

# Flat columns of `data` that are folded into the nested "Address" field.
address_columns = ["StreetAddress", "City", "StateProvince", "PostalCode", "Country"]
# Flat columns removed before upload: the address columns (now nested), "Tag"
# (replaced by "Tags") and columns the index schema below does not define.
dropped_columns = address_columns + ["Latitude", "Longitude", "LocationType", "Tag"]

# Index schema: keyword fields plus the "DescriptionVector" field.
string_type = SearchFieldDataType.String
fields = [
    SimpleField(name="MerchantId", type=string_type, key=True, filterable=True),
    SearchableField(name="MerchantName", type=string_type, sortable=True),
    SearchableField(name="Description", type=string_type),
    SearchField(
        name="DescriptionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # NOTE(review): 3072 presumably matches the output size of the deployed
        # embedding model - confirm against AZURE_OPENAI_EMBEDDING_DEPLOYMENT.
        vector_search_dimensions=3072,
        vector_search_profile_name=vector_profile_name,
    ),
    SearchableField(name="Category", type=string_type, sortable=True, filterable=True, facetable=True),
    SearchField(
        name="Tags",
        type=SearchFieldDataType.Collection(string_type),
        searchable=True,
        filterable=True,
        facetable=True,
    ),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, filterable=True, sortable=True, facetable=True),
    ComplexField(
        name="Address",
        fields=[
            SearchableField(name=column, type=string_type, filterable=True, sortable=True, facetable=True)
            for column in address_columns
        ],
    ),
]

# Two algorithm configurations are declared; the only profile references the HNSW one.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name=hnsw_config_name, kind="hnsw"),
        ExhaustiveKnnAlgorithmConfiguration(name="my-eknn-vector-config", kind="exhaustiveKnn"),
    ],
    profiles=[
        VectorSearchProfile(name=vector_profile_name, algorithm_configuration_name=hnsw_config_name),
    ],
)

# Semantic configuration: which fields act as title, content and keywords.
prioritized_fields = SemanticPrioritizedFields(
    title_field=SemanticField(field_name="MerchantName"),
    content_fields=[SemanticField(field_name="Description")],
    keywords_fields=[SemanticField(field_name="Tags")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Create the index, or update it when it already exists.
index_client = SearchIndexClient(endpoint=AZURE_AI_SEARCH_ENDPOINT, credential=credential)
index_definition = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=SemanticSearch(configurations=[semantic_config]),
)
index = index_client.create_or_update_index(index_definition)
print(f"{index.name} created")

# Turn every row of `data` into a search document that matches the schema.
documents = data.to_dict(orient="records")
for record in documents:
    # "Tag" is a comma-separated string; keep the trimmed, non-empty entries.
    pieces = (piece.strip() for piece in record["Tag"].split(","))
    record["Tags"] = [piece for piece in pieces if piece]

    # Nest the address columns; the postal code is stored as a string.
    address = {column: record[column] for column in address_columns}
    address["PostalCode"] = str(address["PostalCode"])
    record["Address"] = address

    for column in dropped_columns:
        record.pop(column, None)

# Embed all descriptions in a single request and attach each vector to its
# document; `item.index` is used as the position in `documents`.
descriptions = [record["Description"] for record in documents]
response = client.embeddings.create(
    model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    input=descriptions,
)
for item in response.data:
    documents[item.index]["DescriptionVector"] = item.embedding

# Upload the documents and report the per-document outcome.
search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=credential,
)
result = search_client.upload_documents(documents=documents)
for outcome in result:
    print(f"Key: {outcome.key}, Succeeded: {outcome.succeeded}, ErrorMessage: {outcome.error_message}")

### Process search query
준비된 search index 에 semantic search 를 보내보자.

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

search_client = SearchClient(
    index_name=index_name,
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_ADMIN_KEY),
)

# Options shared by both semantic queries in this cell.
semantic_options = {
    "query_type": "semantic",
    "semantic_configuration_name": "my-semantic-config",
    "top": 5,
}

# Query 1: text-only semantic query restricted to the "Tags" field.
# A VectorizedQuery on "DescriptionVector" could additionally be passed through
# `vector_queries` (as query 2 does); it is intentionally left out here.
reservation_query = "예약 가능한 가게"
result = search_client.search(
    search_text=reservation_query,
    search_fields=["Tags"],
    select=["MerchantName", "Tags"],
    **semantic_options,
)
print("검색어: 예약 가능한 가게")
for hit in result:
    print(hit)

# Query 2: the same text is sent both as search_text and, embedded, as a
# vector query on "DescriptionVector".
vegetarian_query = "베지테리언에게 좋은 가게"
embedding_response = client.embeddings.create(
    input=vegetarian_query,
    model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
)
vector_query = VectorizedQuery(
    vector=embedding_response.data[0].embedding,
    k_nearest_neighbors=5,
    fields="DescriptionVector",
    kind="vector",
)
result = search_client.search(
    vector_queries=[vector_query],
    search_text=vegetarian_query,
    select=["MerchantName", "Description"],
    **semantic_options,
)
print("검색어: 베지테리언에게 좋은 가게")
for hit in result:
    print(hit)

### Delete the index
사용하지 않는다면, index 는 삭제하자.

In [None]:
# Delete the index whose name is currently held in `index_name`.
index_client.delete_index(index_name)